mtmd, llama : Update HunyuanVL vision-language model support (#22037)

* mtmd, llama : add HunyuanVL vision-language model support

- add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support
- add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder
- add HunyuanVL-specific M-RoPE position encoding for image tokens
- add GGUF conversion for HunyuanVL vision and text models
- add smoke test in tools/mtmd/tests.sh

* fix: fix HunyuanVL XD-RoPE h/w section order

* fix: Remove redundant code

* convert : fix HunyuanOCR / HunyuanVL conversion
 - Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF successfully and produce correct inference output on Metal (F16 / Q8_0).

* clip : fix -Werror=misleading-indentation in bilinear resize

* fix CI: convert_hf_to_gguf type check error
 - convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit `dir_model: Path` parameter so ty can infer the type for load_hparams instead of reporting `Unknown | None`.

---------

Co-authored-by: wendadawen <wendadawen@tencent.com>
Author: manayang
Date: 2026-04-22 17:58:43 +08:00
Committed by: GitHub
Parent: 750579ff14
Commit: 7bfe60fdf9
13 changed files with 336 additions and 27 deletions
@@ -35,15 +35,23 @@ struct mtmd_bitmap {
// position indexing for decoder model
enum mtmd_pos_type {
MTMD_POS_TYPE_NORMAL, // number of positions equals the number of tokens
MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes
MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3
};
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt (used when pos == MTMD_POS_TYPE_HUNYUANVL)
uint32_t n_tokens() const {
if (pos == MTMD_POS_TYPE_HUNYUANVL) {
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
return (nx + 1) * ny + 2;
}
return nx * ny;
}
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
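
For readers following the layout math, here is a minimal standalone sketch of that token-count formula; the function name and sample values are illustrative, not part of this patch:

    #include <cstdint>

    // Illustrative only: the HunyuanVL chunk size for an nx x ny token grid.
    // Layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
    static uint32_t hunyuanvl_n_tokens(uint32_t nx, uint32_t ny) {
        return (nx + 1) * ny + 2; // ny rows of (nx content + 1 newline), plus BOI and EOI
    }
    // e.g. nx = 4, ny = 3: (4 + 1) * 3 + 2 = 17 tokens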
@@ -52,6 +60,7 @@ struct mtmd_image_tokens {
nx,
ny,
pos,
image_idx,
batch_f32.clone(),
id
};
@@ -466,6 +475,7 @@ struct mtmd_context {
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
@@ -611,6 +621,7 @@ struct mtmd_tokenizer {
const llama_vocab * vocab;
mtmd_input_chunks cur;
uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
mtmd_tokenizer(mtmd_context * ctx,
const mtmd_input_text * text,
@@ -819,6 +830,14 @@ struct mtmd_tokenizer {
image_tokens->ny = 1;
}
image_tokens->pos = ctx->pos_type;
// HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row,
// and uses XD-RoPE (dim-3 = image index). Override the position type so that
// n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout.
if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) {
image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL;
image_tokens->image_idx = n_images_added;
GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
@@ -839,6 +858,9 @@ struct mtmd_tokenizer {
add_text(ctx->img_end, true); // add image end token
}
// advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot
n_images_added++;
} else {
// handle audio
@@ -1286,6 +1308,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} break;
case MTMD_POS_TYPE_HUNYUANVL:
{
// HunyuanVL layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
// Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim;
// content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx.
const uint32_t nx = image_tokens->nx;
const uint32_t n_total = image_tokens->n_tokens();
if (i == 0) {
// BOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else if (i == n_total - 1) {
// EOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else {
// content token at (row, col), or the trailing newline of a row (col == nx)
// section 0 = sequential, section 1 = w (col), section 2 = h (row), section 3 = image_idx.
// set_position_mrope_2d writes .y -> section 1 and .x -> section 2
const uint32_t offset = (uint32_t)i - 1;
const uint32_t row = offset / (nx + 1);
const uint32_t col = offset % (nx + 1);
pos.t = pos_0 + i;
pos.x = row;
pos.y = col;
pos.z = image_tokens->image_idx;
}
} break;
default:
GGML_ABORT("invalid position type");
}
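
As a sanity check, the following self-contained sketch mirrors the switch above for a tiny 2x2 grid; pos_0 = 100 and image_idx = 0 are hypothetical sample values, not taken from the patch:

    #include <cstdint>
    #include <cstdio>

    // Illustrative only: prints the XD-RoPE positions assigned for a 2x2
    // HunyuanVL image grid, following the same branch structure as above.
    int main() {
        const uint32_t nx = 2, ny = 2, image_idx = 0;
        const int64_t  pos_0 = 100;
        const uint32_t n_total = (nx + 1) * ny + 2; // 8: BOI + 2 rows of (2 + newline) + EOI
        for (uint32_t i = 0; i < n_total; i++) {
            int64_t t = pos_0 + i, x, y, z;
            if (i == 0 || i == n_total - 1) {
                x = y = z = pos_0 + i;          // BOI / EOI: sequential in every dim
            } else {
                const uint32_t off = i - 1;
                x = off / (nx + 1);             // row   -> section 2 (h)
                y = off % (nx + 1);             // col   -> section 1 (w); col == nx is the row newline
                z = image_idx;                  // dim-3 -> XD-RoPE image slot
            }
            printf("i=%u t=%lld x=%lld y=%lld z=%lld\n",
                   i, (long long)t, (long long)x, (long long)y, (long long)z);
        }
        return 0;
    }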
@@ -1302,6 +1356,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
return std::max(image_tokens->nx, image_tokens->ny);
case MTMD_POS_TYPE_NORMAL:
return image_tokens->n_tokens();
case MTMD_POS_TYPE_HUNYUANVL:
// HunyuanVL: the sequential (dim-0) position advances by the full token count
// (includes BOI/EOI and row newline tokens), not by max(nx, ny)
return image_tokens->n_tokens();
default:
GGML_ABORT("invalid position type");
}
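
To make the difference concrete with a worked example: for a 4x3 grid, MTMD_POS_TYPE_MROPE advances the sequential position by max(4, 3) = 4, while MTMD_POS_TYPE_HUNYUANVL advances it by the full (4 + 1) * 3 + 2 = 17 tokens, since BOI/EOI and the per-row newline tokens each occupy a dim-0 slot.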