vocab: fix Gemma4 tokenizer (#21343)

* seems to work

* fix case with newline

Co-authored-by: sayap <sokann@gmail.com>

* gemma 4: fix pre-tokenizer regex

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: sayap <sokann@gmail.com>
This commit is contained in:
Piotr Wilkin (ilintar)
2026-04-03 10:33:03 +02:00
committed by GitHub
parent 0c58ba3365
commit b069b10ab4
5 changed files with 69 additions and 9 deletions
+6 -2
View File
@@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
return false;
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
{ "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
start += offset;
}
return unicode_byte_encoding_process(bpe_words);
if (byte_encode) {
return unicode_byte_encoding_process(bpe_words);
}
return bpe_words;
}