mtmd, llama : Update HunyuanVL vision-language model support (#22037)

* mtmd, llama : add HunyuanVL vision-language model support

- add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support
- add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder
- add HunyuanVL-specific M-RoPE position encoding for image tokens
- add GGUF conversion for HunyuanVL vision and text models
- add smoke test in tools/mtmd/tests.sh

* fix: correct HunyuanVL XD-RoPE h/w section order

* fix: Remove redundant code

* convert : fix HunyuanOCR / HunyuanVL conversion
 - Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF successfully and produce correct inference output on Metal (F16 / Q8_0).

* clip : fix -Werror=misleading-indentation in bilinear resize

* fix CI: convert_hf_to_gguf type check error
 - convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit `dir_model: Path` parameter so ty can infer the type for load_hparams instead of reporting `Unknown | None`.

---------

Co-authored-by: wendadawen <wendadawen@tencent.com>
manayang
2026-04-22 17:58:43 +08:00
committed by GitHub
parent 750579ff14
commit 7bfe60fdf9
13 changed files with 336 additions and 27 deletions
+92 -7
@@ -11855,7 +11855,7 @@ class LLaDAMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
@@ -11994,28 +11994,58 @@ class HunYuanModel(TextModel):
@ModelBase.register("HunYuanVLForConditionalGeneration")
-class HunyuanOCRVisionModel(MmprojModel):
+class HunyuanVLVisionModel(MmprojModel):
# Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
# "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
# Each variant maps to a different projector type in clip.cpp so image
# preprocessing follows the correct code path.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
-# HunyuanOCR uses max_image_size instead of image_size
+# HunyuanOCR / HunyuanVL use max_image_size instead of image_size
if "image_size" not in self.hparams_vision:
self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
@staticmethod
def is_ocr_variant(hparams: dict) -> bool:
"""Return True for HunyuanOCR, False for HunyuanVL.
The projector's output dim must equal the text model's hidden_size by
construction (that's what "projector" means). HunyuanOCR pairs a 1B text
backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
ViT -> LLM projection dim is a hard architectural signature, not a
magic number.
"""
vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
return vision_out == 1024
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
-hparams = self.hparams_vision
+vcfg = self.hparams_vision
if self.is_ocr_variant(self.global_config):
# --- HunyuanOCR ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
self.gguf_writer.add_vision_use_gelu(True)
-self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
-self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
+self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
+self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
return
# --- HunyuanVL ---
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if not name.startswith("vit."):
-return # skip text tensors
+return
# strip CLS token (row 0) from position embeddings so resize_position_embeddings works
if "position_embedding" in name:
data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
@@ -12023,11 +12053,66 @@ class HunyuanOCRVisionModel(MmprojModel):
def tensor_force_quant(self, name, new_name, bid, n_dims):
# force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
# Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
@ModelBase.register("HunYuanVLForConditionalGeneration")
class HunyuanVLTextModel(HunYuanModel):
# The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
# and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
# while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
# the config and pick the matching GGUF architecture.
model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
@staticmethod
def _is_ocr_config(hparams: dict) -> bool:
# OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
# outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
# HunyuanVLVisionModel.is_ocr_variant.
return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
def __init__(self, dir_model: Path, *args, **kwargs):
raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
if self._is_ocr_config(raw_hparams):
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
else:
self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
super().__init__(dir_model, *args, **kwargs)
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
# the HunYuan-Dense arch, which already handles standard RoPE in super().
if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
return
if self.rope_parameters.get("rope_type") != "xdrope":
return
# defaults for HunyuanVL. The C++ side later computes:
# freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
ctx_len = int(self.hparams["max_position_embeddings"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
self.gguf_writer.add_context_length(ctx_len)
self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision tensors — they are written by HunyuanVLVisionModel
if name.startswith("vit."):
return
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.SMOLLM3
+20
@@ -197,6 +197,7 @@ class Keys:
FREQ_BASE_SWA = "{arch}.rope.freq_base_swa"
SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor"
SCALING_ALPHA = "{arch}.rope.scaling.alpha"
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
@@ -471,6 +472,7 @@ class MODEL_ARCH(IntEnum):
ERNIE4_5_MOE = auto()
HUNYUAN_MOE = auto()
HUNYUAN_DENSE = auto()
HUNYUAN_VL = auto()
SMOLLM3 = auto()
GPT_OSS = auto()
LFM2 = auto()
@@ -957,6 +959,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.FALCON_H1: "falcon-h1",
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
MODEL_ARCH.HUNYUAN_VL: "hunyuan_vl",
MODEL_ARCH.SMOLLM3: "smollm3",
MODEL_ARCH.GPT_OSS: "gpt-oss",
MODEL_ARCH.LFM2: "lfm2",
@@ -3489,6 +3492,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.HUNYUAN_VL: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.SMOLLM3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -4138,6 +4157,7 @@ class VisionProjectorType:
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
HUNYUANVL = "hunyuanvl"
# Items here are (block size, type size)
+3
@@ -973,6 +973,9 @@ class GGUFWriter:
def add_rope_scaling_factor(self, value: float) -> None:
self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
def add_rope_scaling_alpha(self, value: float) -> None:
self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value)
def add_rope_scaling_attn_factors(self, value: float) -> None:
self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
+2
@@ -109,6 +109,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_HUNYUAN_DENSE, "hunyuan-dense" },
{ LLM_ARCH_HUNYUAN_VL, "hunyuan_vl" },
{ LLM_ARCH_SMOLLM3, "smollm3" },
{ LLM_ARCH_OPENAI_MOE, "gpt-oss" },
{ LLM_ARCH_LFM2, "lfm2" },
@@ -250,6 +251,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
{ LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ALPHA, "%s.rope.scaling.alpha" },
{ LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+2
@@ -113,6 +113,7 @@ enum llm_arch {
LLM_ARCH_ERNIE4_5_MOE,
LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_HUNYUAN_DENSE,
LLM_ARCH_HUNYUAN_VL,
LLM_ARCH_SMOLLM3,
LLM_ARCH_OPENAI_MOE,
LLM_ARCH_LFM2,
@@ -254,6 +255,7 @@ enum llm_kv {
LLM_KV_ROPE_SCALE_LINEAR,
LLM_KV_ROPE_SCALING_TYPE,
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ALPHA,
LLM_KV_ROPE_SCALING_ATTN_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
+1
@@ -116,6 +116,7 @@ struct llama_hparams {
float rope_freq_base_train_swa = 10000.0f;
float rope_freq_scale_train;
float rope_freq_scale_train_swa = 1.0f;
float rope_scaling_alpha = 0.0f; // NTK-aware alpha for XDRoPE
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;
+22
@@ -737,6 +737,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) {
if (hparams.n_expert <= 1) {
hparams.n_expert = 0;
hparams.n_expert_used = 0;
}
}
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd);
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
@@ -815,6 +822,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
ml.get_key(LLM_KV_ROPE_SCALING_ALPHA, hparams.rope_scaling_alpha, false);
// non-transformer models do not have attention heads
if (hparams.n_head() > 0) {
@@ -2592,9 +2600,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_HUNYUAN_VL:
case LLM_ARCH_HUNYUAN_DENSE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
// XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2))
if (hparams.rope_scaling_alpha > 0.0f) {
const int dim = hparams.n_embd_head_k();
hparams.rope_freq_base_train = hparams.rope_freq_base_train
* powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2));
}
switch (hparams.n_embd) {
case 1024: type = LLM_TYPE_0_5B; break;
@@ -6947,6 +6964,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
}
} break;
case LLM_ARCH_HUNYUAN_VL:
case LLM_ARCH_HUNYUAN_DENSE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8967,6 +8985,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
} break;
case LLM_ARCH_HUNYUAN_VL:
case LLM_ARCH_HUNYUAN_DENSE:
{
llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
@@ -9316,6 +9335,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GLM4_MOE:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_HUNYUAN_VL:
return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
// all model arches should be listed explicitly here
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
+23 -4
@@ -6,6 +6,11 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_embd_head == n_rot);
const bool use_mrope = hparams.use_mrope();
int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
ggml_tensor * cur;
ggml_tensor * inpL;
@@ -37,21 +42,35 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
n_embd_head, n_head, n_head_kv, il);
if (use_mrope) {
Qcur = ggml_rope_multi(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_multi(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
} else {
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
Kcur = build_norm(Kcur,
model.layers[il].attn_k_norm, nullptr,
+3 -1
@@ -150,7 +150,7 @@
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
-// hunyuanocr
+// hunyuanocr / hunyuanvl (shared GGUF tensor names)
#define TN_MM_PRE_NORM "mm.pre_norm.%s"
#define TN_TOK_IMG_BEGIN "mm.image_begin"
#define TN_TOK_IMG_END "mm.image_end"
@@ -303,6 +303,7 @@ enum projector_type {
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_HUNYUANVL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -349,6 +350,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
+80
@@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
} break;
@@ -1473,6 +1474,16 @@ struct clip_model_loader {
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(28*28);
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
hparams.n_merge = 2;
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
hparams.image_resize_pad = false;
hparams.ffn_op = FFN_GELU;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(256, 16384);
hparams.set_warmup_n_tokens(32*32);
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
@@ -2222,6 +2233,7 @@ struct clip_model_loader {
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -2860,6 +2872,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
@@ -2879,6 +2892,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
@@ -3070,6 +3084,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = h * (h + 1) + 1;
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
int merge = ctx->model.hparams.n_merge;
int ow = (img->nx / patch_size) / merge;
@@ -3534,6 +3549,70 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
{
// do nothing
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
// Compute the HunyuanVL 2D position embedding on CPU (with the
// custom sf=(target+0.1)/n_grid bilinear sampling that the
// reference implementation uses) and upload it to the graph
// input declared in clip_graph_hunyuanocr::build().
GGML_ASSERT(model.position_embeddings != nullptr);
ggml_tensor * src_t = model.position_embeddings;
const int64_t n_embd = src_t->ne[0];
const int64_t n_pos = src_t->ne[1]; // = n_grid * n_grid
const int n_grid = (int)std::lround(std::sqrt((double)n_pos));
GGML_ASSERT((int64_t)n_grid * n_grid == n_pos);
const int out_w = pos_w; // pw
const int out_h = pos_h; // ph
// Pull weight to host.
std::vector<float> src(n_embd * n_pos);
ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t));
// Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w):
// ne[0] = n_embd (fastest), ne[1] = out_h*out_w
// dst[(y*out_w + x) * n_embd + c]
std::vector<float> dst((size_t)n_embd * out_h * out_w);
const float sx = (float)(out_w + 0.1f) / (float)n_grid;
const float sy = (float)(out_h + 0.1f) / (float)n_grid;
for (int y = 0; y < out_h; ++y) {
// Match ggml_compute_forward_upscale_f32 pixel-center
// convention (align_corners=False): src_y = (y+0.5)/sy - 0.5.
const float fy = ((float)y + 0.5f) / sy - 0.5f;
int y0 = (int)std::floor(fy);
int y1 = y0 + 1;
y0 = std::clamp(y0, 0, n_grid - 1);
y1 = std::clamp(y1, 0, n_grid - 1);
float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f);
const float wy0 = 1.0f - wy1;
for (int x = 0; x < out_w; ++x) {
const float fx = ((float)x + 0.5f) / sx - 0.5f;
int x0 = (int)std::floor(fx);
int x1 = x0 + 1;
x0 = std::clamp(x0, 0, n_grid - 1);
x1 = std::clamp(x1, 0, n_grid - 1);
float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f);
const float wx0 = 1.0f - wx1;
const float w00 = wy0 * wx0;
const float w01 = wy0 * wx1;
const float w10 = wy1 * wx0;
const float w11 = wy1 * wx1;
const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd];
const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd];
const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd];
const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd];
float * d = &dst[((size_t)y * out_w + x) * n_embd];
for (int c = 0; c < n_embd; ++c) {
d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c];
}
}
}
set_input_f32("hunyuanvl_pos_embd", dst);
} break;
case PROJECTOR_TYPE_LLAMA4:
{
// set the 2D positions
@@ -3760,6 +3839,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_YASA2:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
+15 -1
@@ -5,7 +5,21 @@ ggml_cgraph * clip_graph_hunyuanocr::build() {
const int pw = n_patches_x;
const int ph = n_patches_y;
-ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
// Position embedding interpolation.
// HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard
// ggml_interpolate cannot express. To avoid adding a new ggml op, the
// resize is computed on CPU in clip_image_batch_encode and uploaded here
// as a graph input (named "hunyuanvl_pos_embd").
// HunyuanOCR uses the same square layout and the standard ratio-based
// interpolation provided by resize_position_embeddings().
ggml_tensor * pos_embd = nullptr;
if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
ggml_set_input(pos_embd);
} else {
pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
}
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
+59 -1
@@ -37,13 +37,21 @@ struct mtmd_bitmap {
enum mtmd_pos_type {
MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens
MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes
MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3
};
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
-uint32_t n_tokens() const { return nx * ny; }
+uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt (used when pos == MTMD_POS_TYPE_HUNYUANVL)
uint32_t n_tokens() const {
if (pos == MTMD_POS_TYPE_HUNYUANVL) {
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
return (nx + 1) * ny + 2;
}
return nx * ny;
}
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
@@ -52,6 +60,7 @@ struct mtmd_image_tokens {
nx,
ny,
pos,
image_idx,
batch_f32.clone(),
id
};
@@ -466,6 +475,7 @@ struct mtmd_context {
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
@@ -611,6 +621,7 @@ struct mtmd_tokenizer {
const llama_vocab * vocab;
mtmd_input_chunks cur;
uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
mtmd_tokenizer(mtmd_context * ctx,
const mtmd_input_text * text,
@@ -819,6 +830,14 @@ struct mtmd_tokenizer {
image_tokens->ny = 1;
}
image_tokens->pos = ctx->pos_type;
// HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row,
// and uses XD-RoPE (dim-3 = image index). Override the position type so that
// n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout.
if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) {
image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL;
image_tokens->image_idx = n_images_added;
GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
@@ -839,6 +858,9 @@ struct mtmd_tokenizer {
add_text(ctx->img_end, true); // add image end token
}
// advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot
n_images_added++;
} else {
// handle audio
@@ -1286,6 +1308,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} break;
case MTMD_POS_TYPE_HUNYUANVL:
{
// HunyuanVL layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
// Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim;
// content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx.
const uint32_t nx = image_tokens->nx;
const uint32_t n_total = image_tokens->n_tokens();
if (i == 0) {
// BOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else if (i == n_total - 1) {
// EOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else {
// content token at (row, col), or the trailing newline of a row (col == nx)
// section 0 = sequential, section 1 = w(col), section 2 = h(row), section 3 = image_count.
// set_position_mrope_2d writes .y -> section 1 and .x -> section 2
const uint32_t offset = (uint32_t)i - 1;
const uint32_t row = offset / (nx + 1);
const uint32_t col = offset % (nx + 1);
pos.t = pos_0 + i;
pos.x = row;
pos.y = col;
pos.z = image_tokens->image_idx;
}
} break;
default:
GGML_ABORT("invalid position type");
}
@@ -1302,6 +1356,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
return std::max(image_tokens->nx, image_tokens->ny);
case MTMD_POS_TYPE_NORMAL:
return image_tokens->n_tokens();
case MTMD_POS_TYPE_HUNYUANVL:
// HunyuanVL: the sequential (dim-0) position advances by the full token count
// (includes BOI/EOI and row newline tokens), not by max(nx, ny)
return image_tokens->n_tokens();
default:
GGML_ABORT("invalid position type");
}
+1
@@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"