mtmd, llama : Update HunyuanVL vision-language model support (#22037)
* mtmd, llama : add HunyuanVL vision-language model support

  - add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support
  - add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder
  - add HunyuanVL-specific M-RoPE position encoding for image tokens
  - add GGUF conversion for HunyuanVL vision and text models
  - add smoke test in tools/mtmd/tests.sh

* fix: fix HunyuanVL XD-RoPE h/w section order

* fix: Remove redundant code

* convert : fix HunyuanOCR / HunyuanVL conversion

  Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF
  successfully and produce correct inference output on Metal (F16 / Q8_0).

* clip : fix -Werror=misleading-indentation in bilinear resize

* fix CI: convert_hf_to_gguf type check error

  convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit
  `dir_model: Path` parameter so ty can infer the type for load_hparams
  instead of reporting `Unknown | None`.

---------

Co-authored-by: wendadawen <wendadawen@tencent.com>
@@ -11855,7 +11855,7 @@ class LLaDAMoEModel(TextModel):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
+@ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
 
@@ -11994,28 +11994,58 @@ class HunYuanModel(TextModel):
 
 
 @ModelBase.register("HunYuanVLForConditionalGeneration")
-class HunyuanOCRVisionModel(MmprojModel):
+class HunyuanVLVisionModel(MmprojModel):
+    # Handles both HunyuanOCR and HunyuanVL, which share the HF architecture name
+    # "HunYuanVLForConditionalGeneration" and the `vit.perceive.*` vision layout.
+    # Each variant maps to a different projector type in clip.cpp so image
+    # preprocessing follows the correct code path.
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         assert self.hparams_vision is not None
-        # HunyuanOCR uses max_image_size instead of image_size
+        # HunyuanOCR / HunyuanVL uses max_image_size instead of image_size
         if "image_size" not in self.hparams_vision:
             self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
 
+    @staticmethod
+    def is_ocr_variant(hparams: dict) -> bool:
+        """Return True for HunyuanOCR, False for HunyuanVL.
+
+        The projector's output dim must equal the text model's hidden_size by
+        construction (that's what "projector" means). HunyuanOCR pairs a 1B text
+        backbone (hidden=1024); HunyuanVL pairs a 4B one (hidden=3072). So the
+        ViT -> LLM projection dim is a hard architectural signature, not a
+        magic number.
+        """
+        vision_out = int((hparams.get("vision_config") or {}).get("out_hidden_size", 0))
+        return vision_out == 1024
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         assert self.hparams_vision is not None
-        hparams = self.hparams_vision
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
-        self.gguf_writer.add_vision_use_gelu(True)
-        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
-        self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
-        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
-        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+        vcfg = self.hparams_vision
+
+        if self.is_ocr_variant(self.global_config):
+            # --- HunyuanOCR ---
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+            self.gguf_writer.add_vision_use_gelu(True)
+            self.gguf_writer.add_vision_attention_layernorm_eps(vcfg.get("rms_norm_eps", 1e-5))
+            self.gguf_writer.add_vision_spatial_merge_size(vcfg.get("spatial_merge_size", 2))
+            self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+            self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+            return
+
+        # --- HunyuanVL ---
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANVL)
+        self.gguf_writer.add_vision_use_gelu(str(vcfg["hidden_act"]).lower() == "gelu")
+        self.gguf_writer.add_vision_attention_layernorm_eps(float(vcfg["rms_norm_eps"]))
+        self.gguf_writer.add_vision_spatial_merge_size(int(vcfg["spatial_merge_size"]))
+        self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
+        self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if not name.startswith("vit."):
-            return # skip text tensors
+            return
         # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
         if "position_embedding" in name:
            data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
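A minimal sketch of the variant check above in action, using hypothetical, abbreviated config dicts rather than complete HF configs:

    # Hypothetical stub configs; a real config.json carries many more keys.
    ocr_cfg = {"vision_config": {"out_hidden_size": 1024}}  # 1B text backbone -> HunyuanOCR
    vl_cfg  = {"vision_config": {"out_hidden_size": 3072}}  # 4B text backbone -> HunyuanVL

    assert HunyuanVLVisionModel.is_ocr_variant(ocr_cfg)
    assert not HunyuanVLVisionModel.is_ocr_variant(vl_cfg)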
@@ -12023,11 +12053,66 @@ class HunyuanOCRVisionModel(MmprojModel):
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
+        # Both HunyuanOCR and HunyuanVL emit the ViT -> LLM projection as mm.0/mm.2.
         if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
             return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
         return super().tensor_force_quant(name, new_name, bid, n_dims)
 
 
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanVLTextModel(HunYuanModel):
+    # The "HunYuanVLForConditionalGeneration" HF architecture covers both HunyuanOCR
+    # and HunyuanVL. HunyuanOCR reuses the HunYuan-Dense text backbone (standard RoPE),
+    # while HunyuanVL introduces a new LLM arch with XD-RoPE. Detect the variant from
+    # the config and pick the matching GGUF architecture.
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+
+    @staticmethod
+    def _is_ocr_config(hparams: dict) -> bool:
+        # OCR pairs a 1B text backbone (hidden=1024) with a ViT projector that
+        # outputs 1024-d; HunyuanVL uses 3072-d. Keep in sync with
+        # HunyuanVLVisionModel.is_ocr_variant.
+        return int((hparams.get("vision_config") or {}).get("out_hidden_size", 0)) == 1024
+
+    def __init__(self, dir_model: Path, *args, **kwargs):
+        raw_hparams = kwargs.get("hparams") or ModelBase.load_hparams(dir_model, is_mistral_format=False)
+        if self._is_ocr_config(raw_hparams):
+            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
+        else:
+            self.model_arch = gguf.MODEL_ARCH.HUNYUAN_VL
+        super().__init__(dir_model, *args, **kwargs)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # Only emit XD-RoPE metadata for the HunyuanVL backbone; HunyuanOCR uses
+        # the HunYuan-Dense arch which already handles standard rope in super().
+        if self.model_arch != gguf.MODEL_ARCH.HUNYUAN_VL:
+            return
+
+        if self.rope_parameters.get("rope_type") != "xdrope":
+            return
+
+        # defaults for HunyuanVL. The C++ side later computes:
+        #   freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
+        self.gguf_writer.add_rope_freq_base(float(self.rope_parameters["rope_theta"]))
+        self.gguf_writer.add_rope_scaling_alpha(float(self.rope_parameters["alpha"]))
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_rope_scaling_factor(float(self.rope_parameters.get("factor", 1)))
+
+        ctx_len = int(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_rope_scaling_orig_ctx_len(ctx_len)
+        self.gguf_writer.add_context_length(ctx_len)
+
+        self.gguf_writer.add_rope_dimension_sections(list(self.rope_parameters["xdrope_section"]))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # skip vision tensors; they are written by HunyuanVLVisionModel
+        if name.startswith("vit."):
+            return
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
 
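For orientation, the XD-RoPE metadata that set_gguf_parameters emits, shown with made-up values (the real rope_theta, alpha, and xdrope_section come from the model's config.json, not from this sketch):

    # Hypothetical rope parameters; illustrative only.
    rope_parameters = {
        "rope_type": "xdrope",
        "rope_theta": 10000.0,
        "alpha": 1000.0,
        "xdrope_section": [16, 24, 24, 0],  # made-up split across t/w/h/image streams
    }
    # With these values the writer would emit:
    #   {arch}.rope.freq_base          = 10000.0
    #   {arch}.rope.scaling.alpha      = 1000.0
    #   {arch}.rope.scaling.type       = "none"
    #   {arch}.rope.scaling.factor     = 1.0
    #   {arch}.rope.dimension_sections = [16, 24, 24, 0]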
@@ -197,6 +197,7 @@ class Keys:
         FREQ_BASE_SWA        = "{arch}.rope.freq_base_swa"
         SCALING_TYPE         = "{arch}.rope.scaling.type"
         SCALING_FACTOR       = "{arch}.rope.scaling.factor"
+        SCALING_ALPHA        = "{arch}.rope.scaling.alpha"
         SCALING_ATTN_FACTOR  = "{arch}.rope.scaling.attn_factor"
         SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
@@ -471,6 +472,7 @@ class MODEL_ARCH(IntEnum):
     ERNIE4_5_MOE     = auto()
     HUNYUAN_MOE      = auto()
     HUNYUAN_DENSE    = auto()
+    HUNYUAN_VL       = auto()
     SMOLLM3          = auto()
     GPT_OSS          = auto()
     LFM2             = auto()
@@ -957,6 +959,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.FALCON_H1:       "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE:     "hunyuan-moe",
     MODEL_ARCH.HUNYUAN_DENSE:   "hunyuan-dense",
+    MODEL_ARCH.HUNYUAN_VL:      "hunyuan_vl",
     MODEL_ARCH.SMOLLM3:         "smollm3",
     MODEL_ARCH.GPT_OSS:         "gpt-oss",
     MODEL_ARCH.LFM2:            "lfm2",
@@ -3489,6 +3492,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.HUNYUAN_VL: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.SMOLLM3: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -4138,6 +4157,7 @@ class VisionProjectorType:
    YOUTUVL = "youtuvl"
    NEMOTRON_V2_VL = "nemotron_v2_vl"
    HUNYUANOCR = "hunyuanocr"
+    HUNYUANVL = "hunyuanvl"
 
 
 # Items here are (block size, type size)
@@ -973,6 +973,9 @@ class GGUFWriter:
     def add_rope_scaling_factor(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value)
 
+    def add_rope_scaling_alpha(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_ALPHA.format(arch=self.arch), value)
+
     def add_rope_scaling_attn_factors(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value)
 
@@ -109,6 +109,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ERNIE4_5_MOE,     "ernie4_5-moe"     },
     { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
     { LLM_ARCH_HUNYUAN_DENSE,    "hunyuan-dense"    },
+    { LLM_ARCH_HUNYUAN_VL,       "hunyuan_vl"       },
     { LLM_ARCH_SMOLLM3,          "smollm3"          },
     { LLM_ARCH_OPENAI_MOE,       "gpt-oss"          },
     { LLM_ARCH_LFM2,             "lfm2"             },
@@ -250,6 +251,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"                 },
     { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type"                 },
     { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"               },
+    { LLM_KV_ROPE_SCALING_ALPHA,        "%s.rope.scaling.alpha"                },
     { LLM_KV_ROPE_SCALING_ATTN_FACTOR,  "%s.rope.scaling.attn_factor"          },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"            },
 
@@ -113,6 +113,7 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_HUNYUAN_DENSE,
+    LLM_ARCH_HUNYUAN_VL,
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_OPENAI_MOE,
     LLM_ARCH_LFM2,
@@ -254,6 +255,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ALPHA,
     LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
@@ -116,6 +116,7 @@ struct llama_hparams {
     float    rope_freq_base_train_swa  = 10000.0f;
     float    rope_freq_scale_train;
     float    rope_freq_scale_train_swa = 1.0f;
+    float    rope_scaling_alpha        = 0.0f; // NTK-aware alpha for XDRoPE
 
     uint32_t n_ctx_orig_yarn;
     float    rope_yarn_log_mul = 0.0f;
 
@@ -737,6 +737,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
     ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
 
+    if (arch == LLM_ARCH_HUNYUAN_VL || arch == LLM_ARCH_HUNYUAN_DENSE) {
+        if (hparams.n_expert <= 1) {
+            hparams.n_expert      = 0;
+            hparams.n_expert_used = 0;
+        }
+    }
+
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd);
         ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
@@ -815,6 +822,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
     ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+    ml.get_key(LLM_KV_ROPE_SCALING_ALPHA,       hparams.rope_scaling_alpha, false);
 
     // non-transformer models do not have attention heads
     if (hparams.n_head() > 0) {
@@ -2592,9 +2600,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_HUNYUAN_VL:
         case LLM_ARCH_HUNYUAN_DENSE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+                // XDRoPE / NTK-aware scaling: base = rope_theta * alpha^(dim / (dim - 2))
+                if (hparams.rope_scaling_alpha > 0.0f) {
+                    const int dim = hparams.n_embd_head_k();
+                    hparams.rope_freq_base_train = hparams.rope_freq_base_train
+                        * powf(hparams.rope_scaling_alpha, (float)dim / (float)(dim - 2));
+                }
+
                 switch (hparams.n_embd) {
                     case 1024: type = LLM_TYPE_0_5B; break;
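As a worked example of the NTK-style base adjustment above, with illustrative numbers (rope_theta, alpha, and head_dim come from the converted model, not from this sketch):

    rope_theta = 10000.0
    alpha      = 1000.0   # hypothetical; stored as {arch}.rope.scaling.alpha
    head_dim   = 128

    freq_base = rope_theta * alpha ** (head_dim / (head_dim - 2))
    print(f"{freq_base:.3e}")  # approx 1.116e+07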
@@ -6947,6 +6964,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_HUNYUAN_VL:
             case LLM_ARCH_HUNYUAN_DENSE:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8967,6 +8985,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
             } break;
+        case LLM_ARCH_HUNYUAN_VL:
         case LLM_ARCH_HUNYUAN_DENSE:
             {
                 llm = std::make_unique<llm_build_hunyuan_dense>(*this, params);
@@ -9316,6 +9335,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_GLM4_MOE:
             return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
 
+        case LLM_ARCH_HUNYUAN_VL:
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
 
@@ -6,6 +6,11 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
     GGML_ASSERT(n_embd_head == n_rot);
 
+    const bool use_mrope = hparams.use_mrope();
+
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
@@ -37,22 +42,36 @@ llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, cons
         auto [Qcur, Kcur, Vcur] = build_qkv(model.layers[il], cur,
                 n_embd_head, n_head, n_head_kv, il);
 
-        Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, rope_factors,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
+        if (use_mrope) {
+            Qcur = ggml_rope_multi(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_multi(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+        } else {
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+        }
 
         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);
         cb(Vcur, "Vcur", il);
 
-        Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, rope_factors,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-                );
-
         Kcur = build_norm(Kcur,
                 model.layers[il].attn_k_norm, nullptr,
                 LLM_NORM_RMS, il);
 
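A sketch of how M-RoPE sections assign rotary dimension pairs to position streams, mirroring ggml's sector logic for ggml_rope_multi (the section values here are placeholders, not HunyuanVL's shipped xdrope_section):

    # Placeholder sections: number of dim pairs rotated by each position stream
    # (sequential t, plus the extra XD-RoPE streams).
    sections = [16, 16, 16, 16]

    def stream_for_pair(i: int) -> int:
        """Which position stream rotates dimension pair i."""
        sect_dims = sum(sections)
        sector = i % sect_dims  # the section pattern repeats over all pairs
        acc = 0
        for stream, width in enumerate(sections):
            acc += width
            if sector < acc:
                return stream
        return 0

    # With these placeholders, pairs 0..15 rotate with the first stream,
    # pairs 16..31 with the second, and so on.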
@@ -150,7 +150,7 @@
 #define TN_TOK_BOI          "v.boi"
 #define TN_TOK_EOI          "v.eoi"
 
-// hunyuanocr
+// hunyuanocr / hunyuanvl (shared GGUF tensor names)
 #define TN_MM_PRE_NORM      "mm.pre_norm.%s"
 #define TN_TOK_IMG_BEGIN    "mm.image_begin"
 #define TN_TOK_IMG_END      "mm.image_end"
@@ -303,6 +303,7 @@ enum projector_type {
     PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_NEMOTRON_V2_VL,
     PROJECTOR_TYPE_HUNYUANOCR,
+    PROJECTOR_TYPE_HUNYUANVL,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -349,6 +350,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_KIMIK25,        "kimik25"},
     { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
     { PROJECTOR_TYPE_HUNYUANOCR,     "hunyuanocr"},
+    { PROJECTOR_TYPE_HUNYUANVL,      "hunyuanvl"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
             } break;
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
             } break;
@@ -1473,6 +1474,16 @@ struct clip_model_loader {
                     get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
                     hparams.set_warmup_n_tokens(28*28);
                 } break;
+            case PROJECTOR_TYPE_HUNYUANVL:
+                {
+                    hparams.n_merge = 2;
+                    hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
+                    hparams.image_resize_pad  = false;
+                    hparams.ffn_op = FFN_GELU;
+                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                    hparams.set_limit_image_tokens(256, 16384);
+                    hparams.set_warmup_n_tokens(32*32);
+                } break;
             case PROJECTOR_TYPE_LFM2A:
                 {
                     // audio preprocessing params
@@ -2222,6 +2233,7 @@ struct clip_model_loader {
                     model.mm_eoi = get_tensor(TN_TOK_EOI);
                 } break;
             case PROJECTOR_TYPE_HUNYUANOCR:
+            case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
                     model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -2860,6 +2872,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
@@ -2879,6 +2892,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->ny / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
@@ -3070,6 +3084,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches = h * (h + 1) + 1;
             } break;
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 int merge = ctx->model.hparams.n_merge;
                 int ow = (img->nx / patch_size) / merge;
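A worked count for the token-grid math above, assuming a 16-pixel patch and the default merge of 2 (the actual values come from the mmproj GGUF header, so these numbers are illustrative):

    patch_size, merge = 16, 2          # assumed for illustration
    nx, ny = 1024, 768                 # preprocessed image size in pixels
    ow = (nx // patch_size) // merge   # 32 token columns
    oh = (ny // patch_size) // merge   # 24 token rows
    print(ow * oh)                     # 768 image tokens, before BOI/EOI/newlines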
@@ -3534,6 +3549,70 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_HUNYUANVL:
+            {
+                // Compute the HunyuanVL 2D position embedding on CPU (with the
+                // custom sf=(target+0.1)/n_grid bilinear sampling that the
+                // reference implementation uses) and upload it to the graph
+                // input declared in clip_graph_hunyuanocr::build().
+                GGML_ASSERT(model.position_embeddings != nullptr);
+                ggml_tensor * src_t = model.position_embeddings;
+                const int64_t n_embd = src_t->ne[0];
+                const int64_t n_pos  = src_t->ne[1]; // = n_grid * n_grid
+                const int n_grid = (int)std::lround(std::sqrt((double)n_pos));
+                GGML_ASSERT((int64_t)n_grid * n_grid == n_pos);
+                const int out_w = pos_w; // pw
+                const int out_h = pos_h; // ph
+
+                // Pull weight to host.
+                std::vector<float> src(n_embd * n_pos);
+                ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t));
+
+                // Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w):
+                //   ne[0] = n_embd (fastest), ne[1] = out_h*out_w
+                //   dst[(y*out_w + x) * n_embd + c]
+                std::vector<float> dst((size_t)n_embd * out_h * out_w);
+
+                const float sx = (float)(out_w + 0.1f) / (float)n_grid;
+                const float sy = (float)(out_h + 0.1f) / (float)n_grid;
+
+                for (int y = 0; y < out_h; ++y) {
+                    // Match ggml_compute_forward_upscale_f32 pixel-center
+                    // convention (align_corners=False): src_y = (y+0.5)/sy - 0.5.
+                    const float fy = ((float)y + 0.5f) / sy - 0.5f;
+                    int y0 = (int)std::floor(fy);
+                    int y1 = y0 + 1;
+                    y0 = std::clamp(y0, 0, n_grid - 1);
+                    y1 = std::clamp(y1, 0, n_grid - 1);
+                    float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f);
+                    const float wy0 = 1.0f - wy1;
+                    for (int x = 0; x < out_w; ++x) {
+                        const float fx = ((float)x + 0.5f) / sx - 0.5f;
+                        int x0 = (int)std::floor(fx);
+                        int x1 = x0 + 1;
+                        x0 = std::clamp(x0, 0, n_grid - 1);
+                        x1 = std::clamp(x1, 0, n_grid - 1);
+                        float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f);
+                        const float wx0 = 1.0f - wx1;
+
+                        const float w00 = wy0 * wx0;
+                        const float w01 = wy0 * wx1;
+                        const float w10 = wy1 * wx0;
+                        const float w11 = wy1 * wx1;
+
+                        const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd];
+                        const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd];
+                        const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd];
+                        const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd];
+                        float * d = &dst[((size_t)y * out_w + x) * n_embd];
+                        for (int c = 0; c < n_embd; ++c) {
+                            d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c];
+                        }
+                    }
+                }
+
+                set_input_f32("hunyuanvl_pos_embd", dst);
+            } break;
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // set the 2D positions
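A NumPy transcription of the same sampling, handy for checking the C++ loop against the reference implementation (assumes src has shape (n_grid * n_grid, n_embd); this is a verification sketch, not part of the patch):

    import numpy as np

    def resize_pos_embd(src: np.ndarray, n_grid: int, out_h: int, out_w: int) -> np.ndarray:
        """Bilinear resize with HunyuanVL's scale factor sf = (target + 0.1) / n_grid."""
        grid = src.reshape(n_grid, n_grid, -1)
        sy = (out_h + 0.1) / n_grid
        sx = (out_w + 0.1) / n_grid
        out = np.empty((out_h, out_w, src.shape[1]), dtype=src.dtype)
        for y in range(out_h):
            fy = (y + 0.5) / sy - 0.5                      # pixel-center, align_corners=False
            y0 = int(np.floor(fy)); y1 = y0 + 1
            y0 = max(0, min(y0, n_grid - 1)); y1 = max(0, min(y1, n_grid - 1))
            wy1 = min(max(fy - y0, 0.0), 1.0)
            for x in range(out_w):
                fx = (x + 0.5) / sx - 0.5
                x0 = int(np.floor(fx)); x1 = x0 + 1
                x0 = max(0, min(x0, n_grid - 1)); x1 = max(0, min(x1, n_grid - 1))
                wx1 = min(max(fx - x0, 0.0), 1.0)
                top = (1 - wx1) * grid[y0, x0] + wx1 * grid[y0, x1]
                bot = (1 - wx1) * grid[y1, x0] + wx1 * grid[y1, x1]
                out[y, x] = (1 - wy1) * top + wy1 * bot
        return out.reshape(out_h * out_w, -1)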
@@ -3760,6 +3839,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_YASA2:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
 
@@ -5,7 +5,21 @@ ggml_cgraph * clip_graph_hunyuanocr::build() {
     const int pw = n_patches_x;
     const int ph = n_patches_y;
 
-    ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
+    // Position embedding interpolation.
+    // HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard
+    // ggml_interpolate cannot express. To avoid adding a new ggml op, the
+    // resize is computed on CPU in clip_image_batch_encode and uploaded here
+    // as a graph input (named "hunyuanvl_pos_embd").
+    // HunyuanOCR uses the same square layout and the standard ratio-based
+    // interpolation provided by resize_position_embeddings().
+    ggml_tensor * pos_embd = nullptr;
+    if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
+        pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
+        ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
+        ggml_set_input(pos_embd);
+    } else {
+        pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
+    }
 
     ggml_tensor * inp = build_inp();
     ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
 
@@ -35,15 +35,23 @@ struct mtmd_bitmap {
 
 // position indexing for decoder model
 enum mtmd_pos_type {
-    MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens
-    MTMD_POS_TYPE_MROPE,  // qwen-vl mrope style, each image takes max(t,h,w) position indexes
+    MTMD_POS_TYPE_NORMAL,    // number of positions equals to number of tokens
+    MTMD_POS_TYPE_MROPE,     // qwen-vl mrope style, each image takes max(t,h,w) position indexes
+    MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3
 };
 
 struct mtmd_image_tokens {
     uint32_t nx; // number of tokens in x direction
     uint32_t ny; // number of tokens in y direction
     mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
-    uint32_t n_tokens() const { return nx * ny; }
+    uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt (used by pos == MTMD_POS_TYPE_HUNYUANVL)
+    uint32_t n_tokens() const {
+        if (pos == MTMD_POS_TYPE_HUNYUANVL) {
+            // [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
+            return (nx + 1) * ny + 2;
+        }
+        return nx * ny;
+    }
     clip_image_f32_batch batch_f32; // preprocessed image patches
     std::string id; // optional user-defined ID, useful for KV cache tracking
 
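A worked example of the layout that n_tokens() encodes, for a small grid (nx = 4 token columns, ny = 3 rows):

    # [BOI]  r0c0 r0c1 r0c2 r0c3 NL  r1c0..r1c3 NL  r2c0..r2c3 NL  [EOI]
    nx, ny = 4, 3
    n_tokens = (nx + 1) * ny + 2   # one newline per row, plus BOI and EOI
    print(n_tokens)                # 17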
@@ -52,6 +60,7 @@ struct mtmd_image_tokens {
             nx,
             ny,
             pos,
+            image_idx,
             batch_f32.clone(),
             id
         };
@@ -466,6 +475,7 @@ struct mtmd_context {
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
             case PROJECTOR_TYPE_HUNYUANOCR:
+            case PROJECTOR_TYPE_HUNYUANVL:
                 {
                     // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
                     img_beg = "<|hy_place▁holder▁no▁100|>";
@@ -611,6 +621,7 @@ struct mtmd_tokenizer {
     const llama_vocab * vocab;
 
     mtmd_input_chunks cur;
+    uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
 
     mtmd_tokenizer(mtmd_context * ctx,
             const mtmd_input_text * text,
@@ -819,6 +830,14 @@ struct mtmd_tokenizer {
                 image_tokens->ny = 1;
             }
             image_tokens->pos = ctx->pos_type;
+            // HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row,
+            // and uses XD-RoPE (dim-3 = image index). Override the position type so that
+            // n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout.
+            if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) {
+                image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL;
+                image_tokens->image_idx = n_images_added;
+                GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
+            }
             image_tokens->batch_f32 = std::move(batch_f32);
             image_tokens->id = bitmap->id; // optional
 
@@ -839,6 +858,9 @@ struct mtmd_tokenizer {
                 add_text(ctx->img_end, true); // add image end token
             }
 
+            // advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot
+            n_images_added++;
+
         } else {
             // handle audio
 
@@ -1286,6 +1308,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima
                 pos.y = pos_0 + i;
                 pos.z = pos_0 + i;
             } break;
+        case MTMD_POS_TYPE_HUNYUANVL:
+            {
+                // HunyuanVL layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
+                // Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim;
+                // content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx.
+                const uint32_t nx      = image_tokens->nx;
+                const uint32_t n_total = image_tokens->n_tokens();
+                if (i == 0) {
+                    // BOI
+                    pos.t = pos_0 + i;
+                    pos.x = pos_0 + i;
+                    pos.y = pos_0 + i;
+                    pos.z = pos_0 + i;
+                } else if (i == n_total - 1) {
+                    // EOI
+                    pos.t = pos_0 + i;
+                    pos.x = pos_0 + i;
+                    pos.y = pos_0 + i;
+                    pos.z = pos_0 + i;
+                } else {
+                    // content token at (row, col), or the trailing newline of a row (col == nx)
+                    // section 0 = sequential, section 1 = w(col), section 2 = h(row), section 3 = image_count
+                    // set_position_mrope_2d writes .y -> section 1 and .x -> section 2
+                    const uint32_t offset = (uint32_t)i - 1;
+                    const uint32_t row    = offset / (nx + 1);
+                    const uint32_t col    = offset % (nx + 1);
+                    pos.t = pos_0 + i;
+                    pos.x = row;
+                    pos.y = col;
+                    pos.z = image_tokens->image_idx;
+                }
+            } break;
         default:
             GGML_ABORT("invalid position type");
     }
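The same assignment in a compact Python mirror, useful for sanity-checking position tuples (pos_0 is the first position of the image chunk; this mirrors the C++ above rather than defining new behavior):

    def hunyuanvl_pos(i: int, nx: int, ny: int, pos_0: int, image_idx: int):
        """Return (t, x, y, z) for token i of an image chunk."""
        n_total = (nx + 1) * ny + 2
        if i == 0 or i == n_total - 1:           # BOI / EOI: sequential in every dim
            p = pos_0 + i
            return (p, p, p, p)
        off = i - 1
        row, col = divmod(off, nx + 1)           # col == nx is the row's newline token
        return (pos_0 + i, row, col, image_idx)  # t sequential; x=row, y=col, z=image index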
@@ -1302,6 +1356,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
             return std::max(image_tokens->nx, image_tokens->ny);
         case MTMD_POS_TYPE_NORMAL:
             return image_tokens->n_tokens();
+        case MTMD_POS_TYPE_HUNYUANVL:
+            // HunyuanVL: the sequential (dim-0) position advances by the full token count
+            // (includes BOI/EOI and row newline tokens), not by max(nx, ny)
+            return image_tokens->n_tokens();
         default:
             GGML_ABORT("invalid position type");
     }
 
@@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
 add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
 add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
+add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-3n-E2B-it-GGUF:Q8_0" --jinja
 
 add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"