mtmd, llama : add HunyuanVL vision-language model support (#22037)

* mtmd, llama : add HunyuanVL vision-language model support

- add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support (see the position sketch after this list)
- add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder
- add HunyuanVL-specific M-RoPE position encoding for image tokens
- add GGUF conversion for HunyuanVL vision and text models
- add smoke test in tools/mtmd/tests.sh
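
For readers unfamiliar with M-RoPE: below is a minimal, illustrative sketch of a multi-axis RoPE position scheme of the kind this commit wires up for image tokens. The struct and function names are hypothetical, and the (t, h, w) channel convention is assumed from the common Qwen2-VL-style scheme; this is not llama.cpp's actual API or HunyuanVL's exact XD-RoPE layout.

```cpp
// Illustrative sketch only: assigning multi-axis RoPE position ids to a
// mixed text/image token sequence. Names and conventions are hypothetical.
#include <cstdio>
#include <vector>

struct Pos3 { int t, h, w; }; // temporal / height / width position channels

// Text tokens advance all channels together; image tokens share one
// temporal step and spread over the patch grid.
static std::vector<Pos3> assign_positions(int n_text_before, int grid_h, int grid_w) {
    std::vector<Pos3> pos;
    for (int i = 0; i < n_text_before; ++i) {
        pos.push_back({i, i, i});
    }
    const int t = n_text_before; // the image occupies a single "time" step
    for (int y = 0; y < grid_h; ++y) {
        for (int x = 0; x < grid_w; ++x) {
            pos.push_back({t, t + y, t + x});
        }
    }
    return pos;
}

int main() {
    // 3 text tokens followed by a 2x2 grid of image tokens
    for (const Pos3 & p : assign_positions(3, 2, 2)) {
        std::printf("(t=%d h=%d w=%d)\n", p.t, p.h, p.w);
    }
    return 0;
}
```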

* fix: HunyuanVL XD-RoPE h/w section order

* fix: remove redundant code

* convert : fix HunyuanOCR / HunyuanVL conversion
  - Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF successfully and produce correct inference output on Metal (F16 / Q8_0).

* clip : fix -Werror=misleading-indentation in bilinear resize

* fix CI: convert_hf_to_gguf type check error
 - convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit `dir_model: Path` parameter so ty can infer the type for load_hparams instead of reporting `Unknown | None`.

---------

Co-authored-by: wendadawen <wendadawen@tencent.com>
manayang authored on 2026-04-22 17:58:43 +08:00, committed by GitHub
commit 7bfe60fdf9 (parent 750579ff14)
13 changed files with 336 additions and 27 deletions
@@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
} break;
@@ -1473,6 +1474,16 @@ struct clip_model_loader {
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(28*28);
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
hparams.n_merge = 2;
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
hparams.image_resize_pad = false;
hparams.ffn_op = FFN_GELU;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(256, 16384);
hparams.set_warmup_n_tokens(32*32);
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
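
For context on the HunyuanVL hparams case above: a hedged sketch of how token bounds like set_limit_image_tokens(256, 16384) plausibly translate into a pixel budget. It assumes the helper scales by the effective merged-patch area (an assumption about clip.cpp internals), and patch_size = 16 is an assumed example value.

```cpp
// Hypothetical sketch: converting image-token bounds into pixel bounds,
// assuming set_limit_image_tokens() scales by the merged-patch area.
// patch_size = 16 and n_merge = 2 are assumed example values.
#include <cstdio>

int main() {
    const int patch_size   = 16;    // assumed; read from the GGUF in practice
    const int n_merge      = 2;     // PatchMerger collapses 2x2 patches
    const int n_tokens_min = 256;   // from set_limit_image_tokens(256, 16384)
    const int n_tokens_max = 16384;

    // one output token covers (n_merge * patch_size)^2 pixels
    const int token_area = n_merge * patch_size * n_merge * patch_size; // 1024
    std::printf("min pixels: %d\n", n_tokens_min * token_area); //   262144 (~512x512)
    std::printf("max pixels: %d\n", n_tokens_max * token_area); // 16777216 (~4096x4096)
    return 0;
}
```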
@@ -2222,6 +2233,7 @@ struct clip_model_loader {
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -2860,6 +2872,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
@@ -2879,6 +2892,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
@@ -3070,6 +3084,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = h * (h + 1) + 1;
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
int merge = ctx->model.hparams.n_merge;
int ow = (img->nx / patch_size) / merge;
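
To make the token-count formula above concrete, here is a small self-contained example; patch_size = 16 and the image dimensions are assumed example values (the real patch size comes from the GGUF hparams).

```cpp
// Worked example of the output-token formula above; patch_size = 16 and
// the image size are assumed example values, not fixed model constants.
#include <cstdio>

int main() {
    const int patch_size = 16;     // assumed; loaded from hparams in clip.cpp
    const int merge      = 2;      // hparams.n_merge for HunyuanVL
    const int nx = 1024, ny = 768; // preprocessed image size in pixels

    const int ow = (nx / patch_size) / merge; // 32 tokens per row
    const int oh = (ny / patch_size) / merge; // 24 rows
    std::printf("%d x %d = %d image tokens\n", ow, oh, ow * oh); // 768
    return 0;
}
```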
@@ -3534,6 +3549,70 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
{
// do nothing
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
// Compute the HunyuanVL 2D position embedding on CPU (with the
// custom sf=(target+0.1)/n_grid bilinear sampling that the
// reference implementation uses) and upload it to the graph
// input declared in clip_graph_hunyuanocr::build().
GGML_ASSERT(model.position_embeddings != nullptr);
ggml_tensor * src_t = model.position_embeddings;
const int64_t n_embd = src_t->ne[0];
const int64_t n_pos = src_t->ne[1]; // = n_grid * n_grid
const int n_grid = (int)std::lround(std::sqrt((double)n_pos));
GGML_ASSERT((int64_t)n_grid * n_grid == n_pos);
const int out_w = pos_w; // pw
const int out_h = pos_h; // ph
// Pull weight to host.
std::vector<float> src(n_embd * n_pos);
ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t));
// Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w):
// ne[0] = n_embd (fastest), ne[1] = out_h*out_w
// dst[(y*out_w + x) * n_embd + c]
std::vector<float> dst((size_t)n_embd * out_h * out_w);
const float sx = (float)(out_w + 0.1f) / (float)n_grid;
const float sy = (float)(out_h + 0.1f) / (float)n_grid;
for (int y = 0; y < out_h; ++y) {
// Match ggml_compute_forward_upscale_f32 pixel-center
// convention (align_corners=False): src_y = (y+0.5)/sy - 0.5.
const float fy = ((float)y + 0.5f) / sy - 0.5f;
int y0 = (int)std::floor(fy);
int y1 = y0 + 1;
y0 = std::clamp(y0, 0, n_grid - 1);
y1 = std::clamp(y1, 0, n_grid - 1);
float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f);
const float wy0 = 1.0f - wy1;
for (int x = 0; x < out_w; ++x) {
const float fx = ((float)x + 0.5f) / sx - 0.5f;
int x0 = (int)std::floor(fx);
int x1 = x0 + 1;
x0 = std::clamp(x0, 0, n_grid - 1);
x1 = std::clamp(x1, 0, n_grid - 1);
float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f);
const float wx0 = 1.0f - wx1;
const float w00 = wy0 * wx0;
const float w01 = wy0 * wx1;
const float w10 = wy1 * wx0;
const float w11 = wy1 * wx1;
const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd];
const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd];
const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd];
const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd];
float * d = &dst[((size_t)y * out_w + x) * n_embd];
for (int c = 0; c < n_embd; ++c) {
d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c];
}
}
}
set_input_f32("hunyuanvl_pos_embd", dst);
} break;
case PROJECTOR_TYPE_LLAMA4:
{
// set the 2D positions
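
The HunyuanVL position-embedding interpolation above is plain CPU code, so it can be exercised in isolation. Below is a minimal standalone sketch of the same sf = (target + 0.1) / n_grid bilinear sampling on a toy grid, independent of ggml; the 2x2 source grid and single-channel layout are illustrative, chosen only for eyeballing the kernel against the reference implementation.

```cpp
// Standalone sketch of the sf = (target + 0.1) / n_grid bilinear sampling
// used above, on a toy 2x2 grid with n_embd = 1. Independent of ggml.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// src holds n_grid x n_grid values (row-major); returns out_h x out_w values.
static std::vector<float> resample(const std::vector<float> & src, int n_grid,
                                   int out_w, int out_h) {
    std::vector<float> dst((size_t) out_w * out_h);
    const float sx = (out_w + 0.1f) / (float) n_grid;
    const float sy = (out_h + 0.1f) / (float) n_grid;
    for (int y = 0; y < out_h; ++y) {
        // pixel-center convention (align_corners=False), as in the kernel above
        const float fy = ((float) y + 0.5f) / sy - 0.5f;
        int y0 = (int) std::floor(fy);
        int y1 = y0 + 1;
        y0 = std::clamp(y0, 0, n_grid - 1);
        y1 = std::clamp(y1, 0, n_grid - 1);
        const float wy1 = std::clamp(fy - (float) y0, 0.0f, 1.0f);
        for (int x = 0; x < out_w; ++x) {
            const float fx = ((float) x + 0.5f) / sx - 0.5f;
            int x0 = (int) std::floor(fx);
            int x1 = x0 + 1;
            x0 = std::clamp(x0, 0, n_grid - 1);
            x1 = std::clamp(x1, 0, n_grid - 1);
            const float wx1 = std::clamp(fx - (float) x0, 0.0f, 1.0f);
            dst[(size_t) y * out_w + x] =
                (1.0f - wy1) * ((1.0f - wx1) * src[y0 * n_grid + x0] + wx1 * src[y0 * n_grid + x1]) +
                wy1          * ((1.0f - wx1) * src[y1 * n_grid + x0] + wx1 * src[y1 * n_grid + x1]);
        }
    }
    return dst;
}

int main() {
    const std::vector<float> grid = {0.0f, 1.0f,   // 2x2 source grid
                                     2.0f, 3.0f};
    const std::vector<float> out = resample(grid, 2, 4, 4); // upsample to 4x4
    for (int y = 0; y < 4; ++y) {
        for (int x = 0; x < 4; ++x) std::printf("%5.2f ", out[(size_t) y * 4 + x]);
        std::printf("\n");
    }
    return 0;
}
```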
@@ -3760,6 +3839,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_YASA2:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
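
These case arms rely on the ggml convention that, for a 2D weight consumed by ggml_mul_mat, ne[0] is the input dimension and ne[1] the output dimension, so ne[1] of the projection weight is the embedding size handed to the LLM. A tiny sketch with a mock struct standing in for ggml_tensor (the dimensions are illustrative, not HunyuanVL's actual sizes):

```cpp
// Minimal mock of the ggml dimension convention used above: for a
// linear-layer weight W used as ggml_mul_mat(W, x), ne[0] is the input
// width and ne[1] the output width. mock_tensor is a stand-in, not the
// real ggml_tensor type.
#include <cstdint>
#include <cstdio>

struct mock_tensor { int64_t ne[4]; };

int main() {
    // e.g. a projection from 4096 vision features to a 2048-dim LLM space
    const mock_tensor mm_model_proj = {{4096, 2048, 1, 1}};
    std::printf("n_mmproj_embd = %lld\n", (long long) mm_model_proj.ne[1]);
    return 0;
}
```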