common/json-schema: fix: handle non-capturing groups (?:...) in JSON schema pattern converter (#21124)
The regex-to-grammar converter in _visit_pattern() crashes with SIGSEGV
when a JSON schema "pattern" field contains a non-capturing group (?:...).
Root cause: when the parser sees '(' followed by '?', it pushes a warning
but does not advance past '?:'. The recursive transform() call then
interprets '?' as a quantifier and calls seq.back() on an empty vector,
causing undefined behavior.
This commonly occurs when serving OpenAI-compatible tool calls from
clients that include complex regex patterns in their JSON schemas (e.g.,
date validation patterns like ^(?:(?:\d\d[2468][048]|...)-02-29|...)$).
The fix:
- Skip '?:' after '(' so that non-capturing groups are treated as regular groups.
- For unsupported syntax (?=, ?!, etc.), skip to the matching ')' safely,
  handling escaped characters to avoid miscounting parenthesis depth.
- Adjust the unbalanced-parentheses check for ')' to use direct char
  comparisons instead of substr.
- Add test cases for non-capturing groups (C++ only, as the JS/Python
  implementations do not yet support this syntax).
This commit is contained in:
@@ -416,15 +416,30 @@ private:
|
||||
i++;
|
||||
} else if (c == '(') {
|
||||
i++;
|
||||
if (i < length) {
|
||||
if (sub_pattern[i] == '?') {
|
||||
if (i < length && sub_pattern[i] == '?') {
|
||||
if (i + 1 < length && sub_pattern[i + 1] == ':') {
|
||||
i += 2; // skip "?:" for non-capturing group, treat as regular group
|
||||
} else {
|
||||
// lookahead/lookbehind (?=, ?!, ?<=, ?<!) - not supported
|
||||
_warnings.push_back("Unsupported pattern syntax");
|
||||
// skip to matching ')' to avoid UB on empty seq
|
||||
int depth = 1;
|
||||
while (i < length && depth > 0) {
|
||||
if (sub_pattern[i] == '\\' && i + 1 < length) {
|
||||
i += 2; // skip escaped character
|
||||
} else {
|
||||
if (sub_pattern[i] == '(') depth++;
|
||||
else if (sub_pattern[i] == ')') depth--;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
seq.emplace_back("(" + to_rule(transform()) + ")", false);
|
||||
} else if (c == ')') {
|
||||
i++;
|
||||
if (start > 0 && sub_pattern[start - 1] != '(') {
|
||||
if (start > 0 && sub_pattern[start - 1] != '(' && (start < 2 || sub_pattern[start - 2] != '?' || sub_pattern[start - 1] != ':')) {
|
||||
_errors.push_back("Unbalanced parentheses");
|
||||
}
|
||||
return join_seq();
|
||||
|
||||
Reference in New Issue
Block a user