From 7bfe60fdf929ae569b81bbbce7ff7be5a1f8e354 Mon Sep 17 00:00:00 2001 From: manayang Date: Wed, 22 Apr 2026 17:58:43 +0800 Subject: [PATCH] mtmd, llama : Update HunyuanVL vision-language model support (#22037) * mtmd, llama : add HunyuanVL vision-language model support - add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support - add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder - add HunyuanVL-specific M-RoPE position encoding for image tokens - add GGUF conversion for HunyuanVL vision and text models - add smoke test in tools/mtmd/tests.sh * fix: fix HunyuanVL XD-RoPE h/w section order * fix: Remove redundant code * convert : fix HunyuanOCR / HunyuanVL conversion - Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF - successfully and produce correct inference output on Metal (F16 / Q8_0). * clip : fix -Werror=misleading-indentation in bilinear resize * fix CI: convert_hf_to_gguf type check error - convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit `dir_model: Path` parameter so ty can infer the type for load_hparams instead of reporting `Unknown | None`. --------- Co-authored-by: wendadawen --- convert_hf_to_gguf.py | 107 +++++++++++++++++++++++++++---- gguf-py/gguf/constants.py | 20 ++++++ gguf-py/gguf/gguf_writer.py | 3 + src/llama-arch.cpp | 2 + src/llama-arch.h | 2 + src/llama-hparams.h | 1 + src/llama-model.cpp | 22 +++++++ src/models/hunyuan-dense.cpp | 41 ++++++++---- tools/mtmd/clip-impl.h | 4 +- tools/mtmd/clip.cpp | 80 +++++++++++++++++++++++ tools/mtmd/models/hunyuanocr.cpp | 16 ++++- tools/mtmd/mtmd.cpp | 64 +++++++++++++++++- tools/mtmd/tests.sh | 1 + 13 files changed, 336 insertions(+), 27 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5b4fb79fc..090686b15 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -11855,7 +11855,7 @@ class LLaDAMoEModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration") +@ModelBase.register("HunYuanDenseV1ForCausalLM") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE @@ -11994,28 +11994,58 @@ class HunYuanModel(TextModel): @ModelBase.register("HunYuanVLForConditionalGeneration") -class HunyuanOCRVisionModel(MmprojModel): +class HunyuanVLVisionModel(MmprojModel): + # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name + # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout. + # Each variant maps to a different projector type in clip.cpp so image + # preprocessing follows the correct code path. + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) assert self.hparams_vision is not None - # HunyuanOCR uses max_image_size instead of image_size + # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size if "image_size" not in self.hparams_vision: self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048) + @staticmethod + def is_ocr_variant(hparams: dict) -> bool: + """Return True for HunyuanOCR, False for HunyuanVL. + + The projector's output dim must equal the text model's hidden_size by + construction (that's what "projector" means). HunyuanOCR pairs a 1B text + backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the + ViT -> LLM projection dim is a hard architectural signature, not a + magic number. 
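+        For example, a config whose vision_config.out_hidden_size is 1024 is
+        detected as HunyuanOCR, while HunyuanVL-4B's 3072-d projection is not.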
+ """ + vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) + return vision_out == 1024 + def set_gguf_parameters(self): super().set_gguf_parameters() assert self.hparams_vision is not None - hparams = self.hparams_vision - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) - self.gguf_writer.add_vision_use_gelu(True) - self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5)) - self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2)) - self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) - self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + vcfg = self.hparams_vision + + if self.is_ocr_variant(self.global_config): + # --- HunyuanOCR --- + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2)) + self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"]) + self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"]) + return + + # --- HunyuanVL --- + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL) + self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu") + self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"])) + self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"])) + self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"])) + self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"])) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if not name.startswith("vit."): - return # skip text tensors + return # strip CLS token (row 0) from position embeddings so resize_position_embeddings works if "position_embedding" in name: data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd] @@ -12023,11 +12053,66 @@ class HunyuanOCRVisionModel(MmprojModel): def tensor_force_quant(self, name, new_name, bid, n_dims): # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal + # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2. if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"): return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) +@ModelBase.register("HunYuanVLForConditionalGeneration") +class HunyuanVLTextModel(HunYuanModel): + # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR + # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE), + # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from + # the config and pick the matching GGUF architecture. + model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + + @staticmethod + def _is_ocr_config(hparams: dict) -> bool: + # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that + # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with + # HunyuanVLVisionModel.is_ocr_variant. 
+ return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024 + + def __init__(self, dir_model: Path, *args, **kwargs): + raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False) + if self._is_ocr_config(raw_hparams): + self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE + else: + self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL + super().__init__(dir_model, *args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses + # the HunYuan-Dense arch which already handles standard rope in super(). + if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL: + return + + if self.rope_parameters.get("rope_type") != "xdrope": + return + + # defaults for HunyuanVL. The C++ side later computes: + # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2)) + self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"])) + self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"])) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1))) + + ctx_len = int(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len) + self.gguf_writer.add_context_length(ctx_len) + + self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"])) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Skip vision tensors — they are written by HunyuanVLVisionModel + if name.startswith("vit."): + return + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("SmolLM3ForCausalLM") class SmolLM3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.SMOLLM3 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c5297a2f4..83ae51ce9 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -197,6 +197,7 @@ class Keys: FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ALPHA = "{arch}.rope.scaling.alpha" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" @@ -471,6 +472,7 @@ class MODEL_ARCH(IntEnum): ERNIE4_5_MOE = auto() HUNYUAN_MOE = auto() HUNYUAN_DENSE = auto() + HUNYUAN_VL = auto() SMOLLM3 = auto() GPT_OSS = auto() LFM2 = auto() @@ -957,6 +959,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.FALCON_H1: "falcon-h1", MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense", + MODEL_ARCH.HUNYUAN_VL: "hunyuan_vl", MODEL_ARCH.SMOLLM3: "smollm3", MODEL_ARCH.GPT_OSS: "gpt-oss", MODEL_ARCH.LFM2: "lfm2", @@ -3489,6 +3492,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.HUNYUAN_VL: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.SMOLLM3: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -4138,6 +4157,7 @@ class VisionProjectorType: YOUTUVL 
= "youtuvl" NEMOTRON_V2_VL = "nemotron_v2_vl" HUNYUANOCR = "hunyuanocr" + HUNYUANVL = "hunyuanvl" # Items here are (block size, type size) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 90d500dc7..6a81ca37d 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -973,6 +973,9 @@ class GGUFWriter: def add_rope_scaling_factor(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) + def add_rope_scaling_alpha(self, value: float) -> None: + self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value) + def add_rope_scaling_attn_factors(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6904b9c1a..633a66fc6 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -109,6 +109,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" }, { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" }, + { LLM_ARCH_HUNYUAN_VL, "hunyuan_vl" }, { LLM_ARCH_SMOLLM3, "smollm3" }, { LLM_ARCH_OPENAI_MOE, "gpt-oss" }, { LLM_ARCH_LFM2, "lfm2" }, @@ -250,6 +251,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ALPHA, "%s.rope.scaling.alpha" }, { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index c4aabab7e..8f335f5c7 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -113,6 +113,7 @@ enum llm_arch { LLM_ARCH_ERNIE4_5_MOE, LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_HUNYUAN_DENSE, + LLM_ARCH_HUNYUAN_VL, LLM_ARCH_SMOLLM3, LLM_ARCH_OPENAI_MOE, LLM_ARCH_LFM2, @@ -254,6 +255,7 @@ enum llm_kv { LLM_KV_ROPE_SCALE_LINEAR, LLM_KV_ROPE_SCALING_TYPE, LLM_KV_ROPE_SCALING_FACTOR, + LLM_KV_ROPE_SCALING_ALPHA, LLM_KV_ROPE_SCALING_ATTN_FACTOR, LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index c2000c77c..ac7f9ee86 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -116,6 +116,7 @@ struct llama_hparams { float rope_freq_base_train_swa = 10000.0f; float rope_freq_scale_train; float rope_freq_scale_train_swa = 1.0f; + float rope_scaling_alpha = 0.0f; // NTK-aware alpha for XDRoPE uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f77b2e921..9e2a13cbd 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -737,6 +737,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false); + if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) { + if (hparams.n_expert <= 1) { + hparams.n_expert = 0; + hparams.n_expert_used = 0; + } + } + if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd); ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl); @@ -815,6 +822,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); + ml.get_key(LLM_KV_ROPE_SCALING_ALPHA, hparams.rope_scaling_alpha, false); // non-transformer models do not have attention heads if (hparams.n_head() > 0) { @@ -2592,9 +2600,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); + + // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2)) + if (hparams.rope_scaling_alpha > 0.0f) { + const int dim = hparams.n_embd_head_k(); + hparams.rope_freq_base_train = hparams.rope_freq_base_train + * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2)); + } switch (hparams.n_embd) { case 1024: type = LLM_TYPE_0_5B; break; @@ -6947,6 +6964,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0); } } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8967,6 +8985,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_HUNYUAN_VL: case LLM_ARCH_HUNYUAN_DENSE: { llm = std::make_unique(*this, params); @@ -9316,6 +9335,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GLM4_MOE: return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + case LLM_ARCH_HUNYUAN_VL: + return model->hparams.use_mrope() ? 
LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX; + // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: GGML_ABORT("unknown architecture"); diff --git a/src/models/hunyuan-dense.cpp b/src/models/hunyuan-dense.cpp index e4e837eb4..1cd85d6d9 100644 --- a/src/models/hunyuan-dense.cpp +++ b/src/models/hunyuan-dense.cpp @@ -6,6 +6,11 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); GGML_ASSERT(n_embd_head == n_rot); + const bool use_mrope = hparams.use_mrope(); + + int sections[4]; + std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); + ggml_tensor * cur; ggml_tensor * inpL; @@ -37,22 +42,36 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur, n_embd_head, n_head, n_head_kv, il); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + if (use_mrope) { + Qcur = ggml_rope_multi( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_multi( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } else { + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 61fe82439..7d6484eea 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -150,7 +150,7 @@ #define TN_TOK_BOI "v.boi" #define TN_TOK_EOI "v.eoi" -// hunyuanocr +// hunyuanocr / hunyuanvl (shared GGUF tensor names) #define TN_MM_PRE_NORM "mm.pre_norm.%s" #define TN_TOK_IMG_BEGIN "mm.image_begin" #define TN_TOK_IMG_END "mm.image_end" @@ -303,6 +303,7 @@ enum projector_type { PROJECTOR_TYPE_KIMIK25, PROJECTOR_TYPE_NEMOTRON_V2_VL, PROJECTOR_TYPE_HUNYUANOCR, + PROJECTOR_TYPE_HUNYUANVL, PROJECTOR_TYPE_UNKNOWN, }; @@ -349,6 +350,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_KIMIK25, "kimik25"}, { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"}, + { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 540b0ea41..45e39898d 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { builder = std::make_unique(ctx, img); } break; @@ -1473,6 +1474,16 @@ struct clip_model_loader { get_u32(KEY_IMAGE_MAX_PIXELS, 
hparams.image_max_pixels); hparams.set_warmup_n_tokens(28*28); } break; + case PROJECTOR_TYPE_HUNYUANVL: + { + hparams.n_merge = 2; + hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; + hparams.image_resize_pad = false; + hparams.ffn_op = FFN_GELU; + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + hparams.set_limit_image_tokens(256, 16384); + hparams.set_warmup_n_tokens(32*32); + } break; case PROJECTOR_TYPE_LFM2A: { // audio preprocessing params @@ -2222,6 +2233,7 @@ struct clip_model_loader { model.mm_eoi = get_tensor(TN_TOK_EOI); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear) model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); @@ -2860,6 +2872,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->nx / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: @@ -2879,6 +2892,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * case PROJECTOR_TYPE_QWEN3VL: case PROJECTOR_TYPE_GLM4V: case PROJECTOR_TYPE_PADDLEOCR: + case PROJECTOR_TYPE_HUNYUANVL: case PROJECTOR_TYPE_YOUTUVL: return (img->ny / params.patch_size) / 2; case PROJECTOR_TYPE_STEP3VL: @@ -3070,6 +3084,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches = h * (h + 1) + 1; } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { int merge = ctx->model.hparams.n_merge; int ow = (img->nx / patch_size) / merge; @@ -3534,6 +3549,70 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { // do nothing } break; + case PROJECTOR_TYPE_HUNYUANVL: + { + // Compute the HunyuanVL 2D position embedding on CPU (with the + // custom sf=(target+0.1)/n_grid bilinear sampling that the + // reference implementation uses) and upload it to the graph + // input declared in clip_graph_hunyuanocr::build(). + GGML_ASSERT(model.position_embeddings != nullptr); + ggml_tensor * src_t = model.position_embeddings; + const int64_t n_embd = src_t->ne[0]; + const int64_t n_pos = src_t->ne[1]; // = n_grid * n_grid + const int n_grid = (int)std::lround(std::sqrt((double)n_pos)); + GGML_ASSERT((int64_t)n_grid * n_grid == n_pos); + const int out_w = pos_w; // pw + const int out_h = pos_h; // ph + + // Pull weight to host. + std::vector src(n_embd * n_pos); + ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t)); + + // Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w): + // ne[0] = n_embd (fastest), ne[1] = out_h*out_w + // dst[(y*out_w + x) * n_embd + c] + std::vector dst((size_t)n_embd * out_h * out_w); + + const float sx = (float)(out_w + 0.1f) / (float)n_grid; + const float sy = (float)(out_h + 0.1f) / (float)n_grid; + + for (int y = 0; y < out_h; ++y) { + // Match ggml_compute_forward_upscale_f32 pixel-center + // convention (align_corners=False): src_y = (y+0.5)/sy - 0.5. 
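+                        // Worked example with illustrative sizes: out_h = 20, n_grid = 32,
+                        // so sy = (20 + 0.1) / 32 ~ 0.628 and fy(y=0) = 0.5 / 0.628 - 0.5 ~ 0.296.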
+ const float fy = ((float)y + 0.5f) / sy - 0.5f; + int y0 = (int)std::floor(fy); + int y1 = y0 + 1; + y0 = std::clamp(y0, 0, n_grid - 1); + y1 = std::clamp(y1, 0, n_grid - 1); + float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f); + const float wy0 = 1.0f - wy1; + for (int x = 0; x < out_w; ++x) { + const float fx = ((float)x + 0.5f) / sx - 0.5f; + int x0 = (int)std::floor(fx); + int x1 = x0 + 1; + x0 = std::clamp(x0, 0, n_grid - 1); + x1 = std::clamp(x1, 0, n_grid - 1); + float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f); + const float wx0 = 1.0f - wx1; + + const float w00 = wy0 * wx0; + const float w01 = wy0 * wx1; + const float w10 = wy1 * wx0; + const float w11 = wy1 * wx1; + + const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd]; + const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd]; + const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd]; + const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd]; + float * d = &dst[((size_t)y * out_w + x) * n_embd]; + for (int c = 0; c < n_embd; ++c) { + d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c]; + } + } + } + + set_input_f32("hunyuanvl_pos_embd", dst); + } break; case PROJECTOR_TYPE_LLAMA4: { // set the 2D positions @@ -3760,6 +3839,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_YASA2: return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanocr.cpp index 37d1e2b86..45ed684f7 100644 --- a/tools/mtmd/models/hunyuanocr.cpp +++ b/tools/mtmd/models/hunyuanocr.cpp @@ -5,7 +5,21 @@ ggml_cgraph * clip_graph_hunyuanocr::build() { const int pw = n_patches_x; const int ph = n_patches_y; - ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); + // Position embedding interpolation. + // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard + // ggml_interpolate cannot express. To avoid adding a new ggml op, the + // resize is computed on CPU in clip_image_batch_encode and uploaded here + // as a graph input (named "hunyuanvl_pos_embd"). + // HunyuanOCR uses the same square layout and the standard ratio-based + // interpolation provided by resize_position_embeddings(). 
+ ggml_tensor * pos_embd = nullptr; + if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) { + pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw); + ggml_set_name(pos_embd, "hunyuanvl_pos_embd"); + ggml_set_input(pos_embd); + } else { + pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR); + } ggml_tensor * inp = build_inp(); ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index cc3de6a85..626361b92 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -35,15 +35,23 @@ struct mtmd_bitmap { // position indexing for decoder model enum mtmd_pos_type { - MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens - MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes + MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens + MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes + MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3 }; struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; - uint32_t n_tokens() const { return nx * ny; } + uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL) + uint32_t n_tokens() const { + if (pos == MTMD_POS_TYPE_HUNYUANVL) { + // [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI] + return (nx + 1) * ny + 2; + } + return nx * ny; + } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -52,6 +60,7 @@ struct mtmd_image_tokens { nx, ny, pos, + image_idx, batch_f32.clone(), id }; @@ -466,6 +475,7 @@ struct mtmd_context { image_preproc = std::make_unique(ctx_v); } break; case PROJECTOR_TYPE_HUNYUANOCR: + case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary img_beg = "<|hy_place▁holder▁no▁100|>"; @@ -611,6 +621,7 @@ struct mtmd_tokenizer { const llama_vocab * vocab; mtmd_input_chunks cur; + uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk mtmd_tokenizer(mtmd_context * ctx, const mtmd_input_text * text, @@ -819,6 +830,14 @@ struct mtmd_tokenizer { image_tokens->ny = 1; } image_tokens->pos = ctx->pos_type; + // HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row, + // and uses XD-RoPE (dim-3 = image index). Override the position type so that + // n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout. 
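+            // e.g. an nx = 4, ny = 3 token grid becomes (4 + 1) * 3 + 2 = 17 decoder tokens.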
+ if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) { + image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL; + image_tokens->image_idx = n_images_added; + GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens()); + } image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmap->id; // optional @@ -839,6 +858,9 @@ struct mtmd_tokenizer { add_text(ctx->img_end, true); // add image end token } + // advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot + n_images_added++; + } else { // handle audio @@ -1286,6 +1308,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima pos.y = pos_0 + i; pos.z = pos_0 + i; } break; + case MTMD_POS_TYPE_HUNYUANVL: + { + // HunyuanVL layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI] + // Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim; + // content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx. + const uint32_t nx = image_tokens->nx; + const uint32_t n_total = image_tokens->n_tokens(); + if (i == 0) { + // BOI + pos.t = pos_0 + i; + pos.x = pos_0 + i; + pos.y = pos_0 + i; + pos.z = pos_0 + i; + } else if (i == n_total - 1) { + // EOI + pos.t = pos_0 + i; + pos.x = pos_0 + i; + pos.y = pos_0 + i; + pos.z = pos_0 + i; + } else { + // content token at (row, col), or the trailing newline of a row (col == nx) + // section 0 = sequential, section 1 = w(col), section 2 = h(row), section 3 = image_count. + // set_position_mrope_2d writes .y -> section 1 and .x -> section 2 + const uint32_t offset = (uint32_t)i - 1; + const uint32_t row = offset / (nx + 1); + const uint32_t col = offset % (nx + 1); + pos.t = pos_0 + i; + pos.x = row; + pos.y = col; + pos.z = image_tokens->image_idx; + } + } break; default: GGML_ABORT("invalid position type"); } @@ -1302,6 +1356,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { return std::max(image_tokens->nx, image_tokens->ny); case MTMD_POS_TYPE_NORMAL: return image_tokens->n_tokens(); + case MTMD_POS_TYPE_HUNYUANVL: + // HunyuanVL: the sequential (dim-0) position advances by the full token count + // (includes BOI/EOI and row newline tokens), not by max(nx, ny) + return image_tokens->n_tokens(); default: GGML_ABORT("invalid position type"); } diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 5da48d61b..83416fb27 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0" add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR" add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR" +add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0" add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
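
For reference, the position layout implemented by MTMD_POS_TYPE_HUNYUANVL can be
sketched in a few lines of Python. This is a minimal illustration of the
(t, x, y, z) assignment only; the nx/ny/image_idx/pos_0 values below are made up,
and the authoritative logic is the C++ switch in mtmd_image_tokens_get_decoder_pos.

    # HunyuanVL decoder layout: [BOI] [row tokens + newline] * ny [EOI]
    # -> (nx + 1) * ny + 2 tokens in total, matching mtmd_image_tokens::n_tokens().
    def hunyuanvl_positions(nx: int, ny: int, image_idx: int, pos_0: int):
        n_total = (nx + 1) * ny + 2
        positions = []
        for i in range(n_total):
            if i == 0 or i == n_total - 1:
                # BOI / EOI take sequential positions in every dimension
                positions.append((pos_0 + i,) * 4)
            else:
                # content token at (row, col); col == nx is the row's trailing newline
                row, col = divmod(i - 1, nx + 1)
                # (t, x, y, z) = (sequential, h = row, w = col, image index)
                positions.append((pos_0 + i, row, col, image_idx))
        return positions

    # a 2x2 token grid occupies (2 + 1) * 2 + 2 = 8 positions
    for p in hunyuanvl_positions(nx=2, ny=2, image_idx=0, pos_0=10):
        print(p)

Note that the sequential dim advances by the full token count, which is why
mtmd_image_tokens_get_n_pos returns n_tokens() for this position type rather
than max(nx, ny) as in the Qwen-VL M-RoPE case.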
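
The NTK-aware frequency base that load_hparams derives from the new
{arch}.rope.scaling.alpha key reduces to a one-line formula; the rope_theta,
alpha, and head_dim values in this example are illustrative, not HunyuanVL's
shipped config:

    def xdrope_freq_base(rope_theta: float, alpha: float, head_dim: int) -> float:
        # freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
        return rope_theta * alpha ** (head_dim / (head_dim - 2))

    print(xdrope_freq_base(rope_theta=10000.0, alpha=1000.0, head_dim=128))  # ~1.11e7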