mtmd, llama : Update HunyuanVL vision-language model support (#22037)

* mtmd, llama : add HunyuanVL vision-language model support

- add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support
- add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder
- add HunyuanVL-specific M-RoPE position encoding for image tokens
- add GGUF conversion for HunyuanVL vision and text models
- add smoke test in tools/mtmd/tests.sh

* fix: fix HunyuanVL XD-RoPE h/w section order

* fix: Remove redundant code

* convert : fix HunyuanOCR / HunyuanVL conversion
 - Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF successfully and produce correct inference output on Metal (F16 / Q8_0).

* clip : fix -Werror=misleading-indentation in bilinear resize

* fix CI: convert_hf_to_gguf type check error
 - convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit `dir_model: Path` parameter so ty can infer the type for load_hparams instead of reporting `Unknown | None`.

---------

Co-authored-by: wendadawen <wendadawen@tencent.com>
Author: manayang
Date: 2026-04-22 17:58:43 +08:00
Committed by: GitHub
Parent: 750579ff14
Commit: 7bfe60fdf9
13 changed files with 336 additions and 27 deletions
@@ -35,15 +35,23 @@ struct mtmd_bitmap {
// position indexing for decoder model
enum mtmd_pos_type {
MTMD_POS_TYPE_NORMAL, // number of positions equals the number of tokens
MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes
MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3
};
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt (used when pos == MTMD_POS_TYPE_HUNYUANVL)
uint32_t n_tokens() const {
if (pos == MTMD_POS_TYPE_HUNYUANVL) {
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
return (nx + 1) * ny + 2;
}
return nx * ny;
}
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
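
For readers following the layout math, here is a minimal standalone sketch of that token-count formula; the function name and sample values are illustrative, not part of this patch:

    #include <cstdint>

    // Illustrative only: the HunyuanVL chunk size for an nx x ny token grid.
    // Layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
    static uint32_t hunyuanvl_n_tokens(uint32_t nx, uint32_t ny) {
        return (nx + 1) * ny + 2; // ny rows of (nx content + 1 newline), plus BOI and EOI
    }
    // e.g. nx = 4, ny = 3: (4 + 1) * 3 + 2 = 17 tokens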
@@ -52,6 +60,7 @@ struct mtmd_image_tokens {
nx,
ny,
pos,
image_idx,
batch_f32.clone(),
id
};
@@ -466,6 +475,7 @@ struct mtmd_context {
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
@@ -611,6 +621,7 @@ struct mtmd_tokenizer {
const llama_vocab * vocab;
mtmd_input_chunks cur;
uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
mtmd_tokenizer(mtmd_context * ctx,
const mtmd_input_text * text,
@@ -819,6 +830,14 @@ struct mtmd_tokenizer {
image_tokens->ny = 1;
}
image_tokens->pos = ctx->pos_type;
// HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row,
// and uses XD-RoPE (dim-3 = image index). Override the position type so that
// n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout.
if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) {
image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL;
image_tokens->image_idx = n_images_added;
GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
@@ -839,6 +858,9 @@ struct mtmd_tokenizer {
add_text(ctx->img_end, true); // add image end token
}
// advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot
n_images_added++;
} else {
// handle audio
@@ -1286,6 +1308,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} break;
case MTMD_POS_TYPE_HUNYUANVL:
{
// HunyuanVL layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
// Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim;
// content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx.
const uint32_t nx = image_tokens->nx;
const uint32_t n_total = image_tokens->n_tokens();
if (i == 0) {
// BOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else if (i == n_total - 1) {
// EOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else {
// content token at (row, col), or the trailing newline of a row (col == nx)
// section 0 = sequential, section 1 = w (col), section 2 = h (row), section 3 = image_idx.
// set_position_mrope_2d writes .y -> section 1 and .x -> section 2
const uint32_t offset = (uint32_t)i - 1;
const uint32_t row = offset / (nx + 1);
const uint32_t col = offset % (nx + 1);
pos.t = pos_0 + i;
pos.x = row;
pos.y = col;
pos.z = image_tokens->image_idx;
}
} break;
default:
GGML_ABORT("invalid position type");
}
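
As a sanity check, the following self-contained sketch mirrors the switch above for a tiny 2x2 grid; pos_0 = 100 and image_idx = 0 are hypothetical sample values, not taken from the patch:

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: prints the XD-RoPE positions assigned for a 2x2
    // HunyuanVL image grid, following the same branch structure as above.
    int main() {
        const uint32_t nx = 2, ny = 2, image_idx = 0;
        const int64_t  pos_0 = 100;
        const uint32_t n_total = (nx + 1) * ny + 2; // 8: BOI + 2 rows of (2 + newline) + EOI
        for (uint32_t i = 0; i < n_total; i++) {
            int64_t t = pos_0 + i, x, y, z;
            if (i == 0 || i == n_total - 1) {
                x = y = z = pos_0 + i;          // BOI / EOI: sequential in every dim
            } else {
                const uint32_t off = i - 1;
                x = off / (nx + 1);             // row   -> section 2 (h)
                y = off % (nx + 1);             // col   -> section 1 (w); col == nx is the row newline
                z = image_idx;                  // dim-3 -> XD-RoPE image slot
            }
            printf("i=%u t=%lld x=%lld y=%lld z=%lld\n",
                   i, (long long)t, (long long)x, (long long)y, (long long)z);
        }
        return 0;
    }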
@@ -1302,6 +1356,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
return std::max(image_tokens->nx, image_tokens->ny);
case MTMD_POS_TYPE_NORMAL:
return image_tokens->n_tokens();
case MTMD_POS_TYPE_HUNYUANVL:
// HunyuanVL: the sequential (dim-0) position advances by the full token count
// (includes BOI/EOI and row newline tokens), not by max(nx, ny)
return image_tokens->n_tokens();
default:
GGML_ABORT("invalid position type");
}
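
To make the difference concrete with a worked example: for a 4x3 grid, MTMD_POS_TYPE_MROPE advances the sequential position by max(4, 3) = 4, while MTMD_POS_TYPE_HUNYUANVL advances it by the full (4 + 1) * 3 + 2 = 17 tokens, since BOI/EOI and the per-row newline tokens each occupy a dim-0 slot.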