vocab: fix Gemma4 tokenizer (#21343)

* seems to work * fix case with new line Co-authored-by: sayap <sokann@gmail.com> * gemma 4: fix pre tok regex --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: sayap <sokann@gmail.com>
2026-04-03 10:33:03 +02:00
parent 0c58ba3365
commit b069b10ab4
5 changed files with 69 additions and 9 deletions
@@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
    return false;
 }

-std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
    // unicode categories
    static const std::map<std::string, int> k_ucat_enum = {
        { "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
        start += offset;
    }

-    return unicode_byte_encoding_process(bpe_words);
+    if (byte_encode) {
+        return unicode_byte_encoding_process(bpe_words);
+    }
+
+    return bpe_words;
 }