mtmd, llama : Update HunyuanVL vision-language model support (#22037)
* mtmd, llama : add HunyuanVL vision-language model support

  - add LLM_ARCH_HUNYUAN_VL with M-RoPE (XD-RoPE) support
  - add PROJECTOR_TYPE_HUNYUANVL with PatchMerger vision encoder
  - add HunyuanVL-specific M-RoPE position encoding for image tokens
  - add GGUF conversion for HunyuanVL vision and text models
  - add smoke test in tools/mtmd/tests.sh

* fix: fix HunyuanVL XD-RoPE h/w section order

* fix: Remove redundant code

* convert : fix HunyuanOCR / HunyuanVL conversion

  Tested locally: both HunyuanOCR and HunyuanVL-4B convert to GGUF successfully
  and produce correct inference output on Metal (F16 / Q8_0).

* clip : fix -Werror=misleading-indentation in bilinear resize

* fix CI: convert_hf_to_gguf type check error

  convert_hf_to_gguf.py: give HunyuanVLTextModel.__init__ an explicit
  `dir_model: Path` parameter so ty can infer the type for load_hparams
  instead of reporting `Unknown | None`.

---------

Co-authored-by: wendadawen <wendadawen@tencent.com>
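As background for the M-RoPE (XD-RoPE) items above: instead of a single scalar position, each image token carries a multi-section position (temporal / height / width components). The sketch below shows one plausible layout for an oh x ow grid of merged image tokens. It is purely illustrative: the exact section order is precisely what the "XD-RoPE h/w section order" fix in this PR adjusts, and every name and offset convention here is hypothetical.

    #include <cstdint>
    #include <vector>

    // Illustrative M-RoPE position layout for one image anchored at text
    // position p0: all image tokens share one temporal position, while the
    // height/width components walk the (row, col) grid of merged tokens.
    struct MRopePos { int64_t t, h, w; };

    static std::vector<MRopePos> image_positions(int64_t p0, int oh, int ow) {
        std::vector<MRopePos> pos;
        pos.reserve((size_t)oh * ow);
        for (int r = 0; r < oh; ++r) {
            for (int c = 0; c < ow; ++c) {
                pos.push_back({p0, p0 + r, p0 + c}); // hedged: offset convention varies by model
            }
        }
        return pos;
    }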
@@ -912,6 +912,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
             } break;
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
             } break;
@@ -1473,6 +1474,16 @@ struct clip_model_loader {
                 get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
                 hparams.set_warmup_n_tokens(28*28);
             } break;
+        case PROJECTOR_TYPE_HUNYUANVL:
+            {
+                hparams.n_merge = 2;
+                hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
+                hparams.image_resize_pad = false;
+                hparams.ffn_op = FFN_GELU;
+                get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                hparams.set_limit_image_tokens(256, 16384);
+                hparams.set_warmup_n_tokens(32*32);
+            } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 // audio preprocessing params
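The limits above are in merged vision tokens. Assuming a 16-pixel patch (the patch size is not shown in this hunk) and n_merge = 2, each merged token covers a 32x32 pixel block, so set_limit_image_tokens(256, 16384) corresponds roughly to square images between 512x512 and 4096x4096 pixels. A small standalone sketch of that arithmetic, with the patch size as a stated assumption:

    #include <cstdio>

    int main() {
        const int patch_size = 16;  // assumed; actually read from the GGUF vision hparams
        const int n_merge    = 2;   // from this hunk (KEY_SPATIAL_MERGE_SIZE may override it)

        const int px_per_token = patch_size * n_merge; // 32 pixels per side of a merged token
        printf("one merged token covers %dx%d pixels\n", px_per_token, px_per_token);
        // A square image hits the token limits at roughly these side lengths:
        printf("min ~%d px/side, max ~%d px/side\n",
               16  * px_per_token,   // sqrt(256)   = 16  merged tokens per side
               128 * px_per_token);  // sqrt(16384) = 128 merged tokens per side
        return 0;
    }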
@@ -2222,6 +2233,7 @@ struct clip_model_loader {
                 model.mm_eoi = get_tensor(TN_TOK_EOI);
             } break;
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
                 model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
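For context, TN_LLAVA_PROJ is a printf-style pattern for projector tensor names in clip.cpp ("mm.%d.%s" at the time of writing), so the call above resolves to "mm.0.weight". A minimal sketch of the formatting, with the pattern hard-coded as an assumption:

    #include <cstdio>

    int main() {
        char name[64];
        // Assumed to match clip-impl.h's TN_LLAVA_PROJ pattern "mm.%d.%s".
        snprintf(name, sizeof(name), "mm.%d.%s", 0, "weight");
        printf("%s\n", name); // -> mm.0.weight
        return 0;
    }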
@@ -2860,6 +2872,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
@@ -2879,6 +2892,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->ny / params.patch_size) / 2;
         case PROJECTOR_TYPE_STEP3VL:
@@ -3070,6 +3084,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                 n_patches = h * (h + 1) + 1;
             } break;
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             {
                 int merge = ctx->model.hparams.n_merge;
                 int ow = (img->nx / patch_size) / merge;
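Putting the three count helpers together: with a hypothetical 1024x768 preprocessed input, a patch size of 16 (assumed) and n_merge = 2, the output grid is 32x24 merged tokens, 768 total. A standalone sketch of the same integer math:

    #include <cstdio>

    int main() {
        const int nx = 1024, ny = 768; // hypothetical preprocessed image size
        const int patch_size = 16;     // assumed vision patch size
        const int merge      = 2;      // hparams.n_merge from the loader hunk above

        const int ow = (nx / patch_size) / merge; // mirrors clip_n_output_tokens_x
        const int oh = (ny / patch_size) / merge; // mirrors clip_n_output_tokens_y
        printf("%d x %d = %d merged tokens\n", ow, oh, ow * oh);
        return 0;
    }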
@@ -3534,6 +3549,70 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_HUNYUANVL:
+            {
+                // Compute the HunyuanVL 2D position embedding on CPU (with the
+                // custom sf=(target+0.1)/n_grid bilinear sampling that the
+                // reference implementation uses) and upload it to the graph
+                // input declared in clip_graph_hunyuanocr::build().
+                GGML_ASSERT(model.position_embeddings != nullptr);
+                ggml_tensor * src_t = model.position_embeddings;
+                const int64_t n_embd = src_t->ne[0];
+                const int64_t n_pos = src_t->ne[1]; // = n_grid * n_grid
+                const int n_grid = (int)std::lround(std::sqrt((double)n_pos));
+                GGML_ASSERT((int64_t)n_grid * n_grid == n_pos);
+                const int out_w = pos_w; // pw
+                const int out_h = pos_h; // ph
+
+                // Pull weight to host.
+                std::vector<float> src(n_embd * n_pos);
+                ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t));
+
+                // Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w):
+                // ne[0] = n_embd (fastest), ne[1] = out_h*out_w
+                // dst[(y*out_w + x) * n_embd + c]
+                std::vector<float> dst((size_t)n_embd * out_h * out_w);
+
+                const float sx = (float)(out_w + 0.1f) / (float)n_grid;
+                const float sy = (float)(out_h + 0.1f) / (float)n_grid;
+
+                for (int y = 0; y < out_h; ++y) {
+                    // Match ggml_compute_forward_upscale_f32 pixel-center
+                    // convention (align_corners=False): src_y = (y+0.5)/sy - 0.5.
+                    const float fy = ((float)y + 0.5f) / sy - 0.5f;
+                    int y0 = (int)std::floor(fy);
+                    int y1 = y0 + 1;
+                    y0 = std::clamp(y0, 0, n_grid - 1);
+                    y1 = std::clamp(y1, 0, n_grid - 1);
+                    float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f);
+                    const float wy0 = 1.0f - wy1;
+                    for (int x = 0; x < out_w; ++x) {
+                        const float fx = ((float)x + 0.5f) / sx - 0.5f;
+                        int x0 = (int)std::floor(fx);
+                        int x1 = x0 + 1;
+                        x0 = std::clamp(x0, 0, n_grid - 1);
+                        x1 = std::clamp(x1, 0, n_grid - 1);
+                        float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f);
+                        const float wx0 = 1.0f - wx1;
+
+                        const float w00 = wy0 * wx0;
+                        const float w01 = wy0 * wx1;
+                        const float w10 = wy1 * wx0;
+                        const float w11 = wy1 * wx1;
+
+                        const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd];
+                        const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd];
+                        const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd];
+                        const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd];
+                        float * d = &dst[((size_t)y * out_w + x) * n_embd];
+                        for (int c = 0; c < n_embd; ++c) {
+                            d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c];
+                        }
+                    }
+                }
+
+                set_input_f32("hunyuanvl_pos_embd", dst);
+            } break;
         case PROJECTOR_TYPE_LLAMA4:
             {
                 // set the 2D positions
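The resampling above differs from a stock resize only in its scale factors: sf = (target + 0.1) / n_grid instead of target / n_grid, combined with the ggml upscale pixel-center mapping src = (dst + 0.5) / sf - 0.5. A tiny standalone check of where output rows land in the source grid (grid and target sizes are illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int n_grid = 32; // side of the learned position-embedding grid
        const int out_h  = 64; // hypothetical target height in merged tokens

        const float sy = (float)(out_h + 0.1f) / (float)n_grid;
        for (int y = 0; y < out_h; y += 16) {
            // Same mapping the hunk uses; rows near the borders map slightly
            // outside [0, n_grid-1], and the hunk's clamps turn that into
            // edge replication rather than extrapolation.
            const float fy = ((float)y + 0.5f) / sy - 0.5f;
            printf("dst row %2d -> src row %.3f\n", y, fy);
        }
        return 0;
    }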
@@ -3760,6 +3839,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_YASA2:
             return ctx->model.mm_2_w->ne[1];
         case PROJECTOR_TYPE_HUNYUANOCR:
+        case PROJECTOR_TYPE_HUNYUANVL:
             return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];