Autoparser - complete refactoring of parser architecture (#18675)
* Autoparser - full single commit squish * Final pre-merge changes: minor fixes, Kimi 2.5 model parser
This commit is contained in:
committed by
GitHub
parent
34df42f7be
commit
566059a26b
@@ -27,11 +27,11 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
||||
if (separator_rule.empty()) {
|
||||
if (min_items == 1 && !has_max) {
|
||||
return item_rule + "+";
|
||||
} else if (min_items == 0 && !has_max) {
|
||||
return item_rule + "*";
|
||||
} else {
|
||||
return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
|
||||
}
|
||||
if (min_items == 0 && !has_max) {
|
||||
return item_rule + "*";
|
||||
}
|
||||
return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
|
||||
}
|
||||
|
||||
auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
|
||||
@@ -41,7 +41,7 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
||||
return result;
|
||||
}
|
||||
|
||||
static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
|
||||
static void build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
|
||||
auto has_min = min_value != std::numeric_limits<int64_t>::min();
|
||||
auto has_max = max_value != std::numeric_limits<int64_t>::max();
|
||||
|
||||
@@ -128,14 +128,14 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
|
||||
if (has_min && has_max) {
|
||||
if (min_value < 0 && max_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
out << ")";
|
||||
return;
|
||||
}
|
||||
|
||||
if (min_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
|
||||
out << ") | ";
|
||||
min_value = 0;
|
||||
}
|
||||
@@ -159,7 +159,7 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
|
||||
if (has_min) {
|
||||
if (min_value < 0) {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
|
||||
build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
|
||||
out << ") | [0] | [1-9] ";
|
||||
more_digits(0, decimals_left - 1);
|
||||
} else if (min_value == 0) {
|
||||
@@ -194,7 +194,7 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
|
||||
}
|
||||
digit_range(c, c);
|
||||
out << " (";
|
||||
_build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
|
||||
build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
|
||||
out << ")";
|
||||
if (c < '9') {
|
||||
out << " | ";
|
||||
@@ -213,10 +213,10 @@ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::string
|
||||
more_digits(0, less_decimals);
|
||||
out << " | ";
|
||||
}
|
||||
_build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
|
||||
build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
|
||||
} else {
|
||||
out << "\"-\" (";
|
||||
_build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
|
||||
build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
|
||||
out << ")";
|
||||
}
|
||||
return;
|
||||
@@ -232,7 +232,7 @@ struct BuiltinRule {
|
||||
std::vector<std::string> deps;
|
||||
};
|
||||
|
||||
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
|
||||
static std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
|
||||
{"boolean", {"(\"true\" | \"false\") space", {}}},
|
||||
{"decimal-part", {"[0-9]{1,16}", {}}},
|
||||
{"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
|
||||
@@ -247,7 +247,7 @@ std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
|
||||
{"null", {"\"null\" space", {}}},
|
||||
};
|
||||
|
||||
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
|
||||
static std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
|
||||
{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
|
||||
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
|
||||
{"date-time", {"date \"T\" time", {"date", "time"}}},
|
||||
@@ -260,22 +260,26 @@ static bool is_reserved_name(const std::string & name) {
|
||||
static const std::unordered_set<std::string> RESERVED_NAMES = [] {
|
||||
std::unordered_set<std::string> s;
|
||||
s.insert("root");
|
||||
for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
|
||||
for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
|
||||
for (const auto & p : PRIMITIVE_RULES) {
|
||||
s.insert(p.first);
|
||||
}
|
||||
for (const auto & p : STRING_FORMAT_RULES) {
|
||||
s.insert(p.first);
|
||||
}
|
||||
return s;
|
||||
}();
|
||||
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
|
||||
}
|
||||
|
||||
std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
|
||||
std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
|
||||
std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
|
||||
std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
|
||||
static std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
|
||||
static std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
|
||||
static std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
|
||||
static std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
|
||||
{'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
|
||||
};
|
||||
|
||||
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
|
||||
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
|
||||
static std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
|
||||
static std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
|
||||
|
||||
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
|
||||
std::smatch match;
|
||||
@@ -322,19 +326,19 @@ private:
|
||||
if (_rules.find(esc_name) == _rules.end() || _rules[esc_name] == rule) {
|
||||
_rules[esc_name] = rule;
|
||||
return esc_name;
|
||||
} else {
|
||||
int i = 0;
|
||||
while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
|
||||
i++;
|
||||
}
|
||||
std::string key = esc_name + std::to_string(i);
|
||||
_rules[key] = rule;
|
||||
return key;
|
||||
}
|
||||
int i = 0;
|
||||
while (_rules.find(esc_name + std::to_string(i)) != _rules.end() && _rules[esc_name + std::to_string(i)] != rule) {
|
||||
i++;
|
||||
}
|
||||
std::string key = esc_name + std::to_string(i);
|
||||
_rules[key] = rule;
|
||||
return key;
|
||||
}
|
||||
|
||||
std::string _generate_union_rule(const std::string & name, const std::vector<json> & alt_schemas) {
|
||||
std::vector<std::string> rules;
|
||||
rules.reserve(alt_schemas.size());
|
||||
for (size_t i = 0; i < alt_schemas.size(); i++) {
|
||||
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
|
||||
}
|
||||
@@ -398,6 +402,7 @@ private:
|
||||
flush_literal();
|
||||
|
||||
std::vector<std::string> results;
|
||||
results.reserve(ret.size());
|
||||
for (const auto & item : ret) {
|
||||
results.push_back(to_rule(item));
|
||||
}
|
||||
@@ -551,7 +556,7 @@ private:
|
||||
TrieNode() : is_end_of_string(false) {}
|
||||
|
||||
void insert(const std::string & string) {
|
||||
auto node = this;
|
||||
auto *node = this;
|
||||
for (char c : string) {
|
||||
node = &node->children[c];
|
||||
}
|
||||
@@ -676,7 +681,7 @@ private:
|
||||
if (ks.empty()) {
|
||||
return res;
|
||||
}
|
||||
std::string k = ks[0];
|
||||
const std::string& k = ks[0];
|
||||
std::string kv_rule_name = prop_kv_rule_names[k];
|
||||
std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
|
||||
if (first_is_optional) {
|
||||
@@ -779,7 +784,7 @@ public:
|
||||
std::string pointer = ref.substr(ref.find('#') + 1);
|
||||
std::vector<std::string> tokens = string_split(pointer, "/");
|
||||
for (size_t i = 1; i < tokens.size(); ++i) {
|
||||
std::string sel = tokens[i];
|
||||
const std::string& sel = tokens[i];
|
||||
if (target.is_object() && target.contains(sel)) {
|
||||
target = target[sel];
|
||||
} else if (target.is_array()) {
|
||||
@@ -802,7 +807,7 @@ public:
|
||||
_refs[ref] = target;
|
||||
}
|
||||
} else {
|
||||
for (auto & kv : n.items()) {
|
||||
for (const auto & kv : n.items()) {
|
||||
visit_refs(kv.value());
|
||||
}
|
||||
}
|
||||
@@ -812,7 +817,7 @@ public:
|
||||
visit_refs(schema);
|
||||
}
|
||||
|
||||
std::string _generate_constant_rule(const json & value) {
|
||||
static std::string _generate_constant_rule(const json & value) {
|
||||
return format_literal(value.dump());
|
||||
}
|
||||
|
||||
@@ -823,10 +828,12 @@ public:
|
||||
|
||||
if (schema.contains("$ref")) {
|
||||
return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
|
||||
} else if (schema.contains("oneOf") || schema.contains("anyOf")) {
|
||||
}
|
||||
if (schema.contains("oneOf") || schema.contains("anyOf")) {
|
||||
std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
|
||||
return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
|
||||
} else if (schema_type.is_array()) {
|
||||
}
|
||||
if (schema_type.is_array()) {
|
||||
std::vector<json> schema_types;
|
||||
for (const auto & t : schema_type) {
|
||||
json schema_copy(schema);
|
||||
@@ -834,15 +841,18 @@ public:
|
||||
schema_types.push_back(schema_copy);
|
||||
}
|
||||
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
||||
} else if (schema.contains("const")) {
|
||||
}
|
||||
if (schema.contains("const")) {
|
||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
|
||||
} else if (schema.contains("enum")) {
|
||||
}
|
||||
if (schema.contains("enum")) {
|
||||
std::vector<std::string> enum_values;
|
||||
for (const auto & v : schema["enum"]) {
|
||||
enum_values.push_back(_generate_constant_rule(v));
|
||||
}
|
||||
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
|
||||
} else if ((schema_type.is_null() || schema_type == "object")
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "object")
|
||||
&& (schema.contains("properties") ||
|
||||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
||||
std::unordered_set<std::string> required;
|
||||
@@ -863,11 +873,12 @@ public:
|
||||
_build_object_rule(
|
||||
properties, required, name,
|
||||
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
|
||||
} else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
|
||||
std::unordered_set<std::string> required;
|
||||
std::vector<std::pair<std::string, json>> properties;
|
||||
std::map<std::string, size_t> enum_values;
|
||||
std::string hybrid_name = name;
|
||||
const std::string& hybrid_name = name;
|
||||
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
|
||||
if (comp_schema.contains("$ref")) {
|
||||
add_component(_refs[comp_schema["$ref"]], is_required);
|
||||
@@ -890,9 +901,9 @@ public:
|
||||
// todo warning
|
||||
}
|
||||
};
|
||||
for (auto & t : schema["allOf"]) {
|
||||
for (const auto & t : schema["allOf"]) {
|
||||
if (t.contains("anyOf")) {
|
||||
for (auto & tt : t["anyOf"]) {
|
||||
for (const auto & tt : t["anyOf"]) {
|
||||
add_component(tt, false);
|
||||
}
|
||||
} else {
|
||||
@@ -911,7 +922,8 @@ public:
|
||||
}
|
||||
}
|
||||
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
|
||||
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
|
||||
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
|
||||
if (items.is_array()) {
|
||||
std::string rule = "\"[\" space ";
|
||||
@@ -923,27 +935,31 @@ public:
|
||||
}
|
||||
rule += " \"]\" space";
|
||||
return _add_rule(rule_name, rule);
|
||||
} else {
|
||||
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
|
||||
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
|
||||
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
|
||||
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
|
||||
|
||||
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
|
||||
}
|
||||
} else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
|
||||
std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
|
||||
int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
|
||||
json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
|
||||
int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
|
||||
|
||||
return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
|
||||
return _visit_pattern(schema["pattern"], rule_name);
|
||||
} else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
|
||||
return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
|
||||
} else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
|
||||
}
|
||||
if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
|
||||
auto prim_name = schema_format + "-string";
|
||||
return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
|
||||
} else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
|
||||
}
|
||||
if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
|
||||
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
|
||||
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
||||
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
||||
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
||||
} else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
|
||||
}
|
||||
if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
|
||||
int64_t min_value = std::numeric_limits<int64_t>::min();
|
||||
int64_t max_value = std::numeric_limits<int64_t>::max();
|
||||
if (schema.contains("minimum")) {
|
||||
@@ -958,19 +974,24 @@ public:
|
||||
}
|
||||
std::stringstream out;
|
||||
out << "(";
|
||||
_build_min_max_int(min_value, max_value, out);
|
||||
build_min_max_int(min_value, max_value, out);
|
||||
out << ") space";
|
||||
return _add_rule(rule_name, out.str());
|
||||
} else if (schema.empty() || schema_type == "object") {
|
||||
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
||||
} else {
|
||||
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
|
||||
_errors.push_back("Unrecognized schema: " + schema.dump());
|
||||
return "";
|
||||
}
|
||||
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
|
||||
return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
|
||||
}
|
||||
if (schema.empty() || schema_type == "object") {
|
||||
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
||||
}
|
||||
if (schema_type.is_null() && schema.is_object()) {
|
||||
// No type constraint and no recognized structural keywords (e.g. {"description": "..."}).
|
||||
// Per JSON Schema semantics this is equivalent to {} and accepts any value.
|
||||
return _add_rule(rule_name, _add_primitive("value", PRIMITIVE_RULES.at("value")));
|
||||
}
|
||||
if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
|
||||
_errors.push_back("Unrecognized schema: " + schema.dump());
|
||||
return "";
|
||||
}
|
||||
// TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
|
||||
return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
|
||||
}
|
||||
|
||||
void check_errors() {
|
||||
@@ -985,7 +1006,7 @@ public:
|
||||
std::string format_grammar() {
|
||||
std::stringstream ss;
|
||||
for (const auto & kv : _rules) {
|
||||
ss << kv.first << " ::= " << kv.second << std::endl;
|
||||
ss << kv.first << " ::= " << kv.second << '\n';
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user