model : support step3-vl-10b (#21287)

* feat: support step3-vl-10b

* use fused QKV and add the tensor mappings in tensor_mapping.py

* guard hardcoded params and drop crop metadata

* get understand_projector_stride from global config

* move img_u8_resize_bilinear_to_f32 into the step3vl class

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix the \r\n mess

* add width and heads to MmprojModel.set_gguf_parameters

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Author: forforever73
Date: 2026-04-08 15:51:31 +08:00 (committed by GitHub)
Parent: 97508acb17
Commit: 09343c0198
12 changed files with 537 additions and 4 deletions
@@ -862,6 +862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    {
        builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
    } break;
case PROJECTOR_TYPE_STEP3VL:
    {
        builder = std::make_unique<clip_graph_step3vl>(ctx, img);
    } break;
case PROJECTOR_TYPE_MINICPMV:
    {
        builder = std::make_unique<clip_graph_minicpmv>(ctx, img);
@@ -1337,6 +1341,17 @@ struct clip_model_loader {
            LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
        }
    } break;
case PROJECTOR_TYPE_STEP3VL:
    {
        hparams.n_merge = 4; // two stride-2 downsamplers after patching
        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
        hparams.rope_theta = 10000.0f;
        get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
        if (hparams.image_longest_edge == 0) {
            hparams.image_longest_edge = 3024;
        }
        hparams.warmup_image_size = hparams.image_size;
    } break;
case PROJECTOR_TYPE_YOUTUVL:
    {
        hparams.n_merge = 2;
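
A hedged back-of-the-envelope check of what these defaults imply: n_merge = 4 means each output token covers patch_size * n_merge pixels per side. The patch size of 14 below is an assumption for illustration; the hunk itself only pins n_merge and the 3024 px fallback.

// Sketch, not part of the diff: token budget implied by the STEP3VL defaults.
// patch_size = 14 is assumed; n_merge and image_longest_edge come from the
// loader code above.
#include <cstdio>

int main() {
    const int patch_size         = 14;   // assumed ViT patch size
    const int n_merge            = 4;    // two stride-2 downsamplers
    const int image_longest_edge = 3024; // loader fallback

    const int px_per_token    = patch_size * n_merge;              // 56 px
    const int tokens_per_edge = image_longest_edge / px_per_token; // 54
    printf("max grid %dx%d = %d image tokens\n",
           tokens_per_edge, tokens_per_edge, tokens_per_edge * tokens_per_edge);
    return 0; // prints: max grid 54x54 = 2916 image tokens
}
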
@@ -1769,6 +1784,14 @@ struct clip_model_loader {
        model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
        model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
    } break;
case PROJECTOR_TYPE_STEP3VL:
    {
        model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
        model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
        model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
        model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"), false);
        model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
    } break;
case PROJECTOR_TYPE_YOUTUVL:
    {
        model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); // merger.ln_q (RMS norm)
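
A minimal sketch of how these tensors could chain together. The real wiring lives in the new clip_graph_step3vl class, which this hunk does not show; the GELU between the two linear layers is an assumption, and clip_model is clip.cpp's internal struct. The biases are loaded as optional (the `false` flag), hence the null checks.

#include "ggml.h"

// Hypothetical helper, for illustration only.
static ggml_tensor * step3vl_project(ggml_context * ctx0,
                                     const clip_model & model,
                                     ggml_tensor * cur) {
    cur = ggml_mul_mat(ctx0, model.mm_0_w, cur);     // first linear layer
    if (model.mm_0_b) {
        cur = ggml_add(ctx0, cur, model.mm_0_b);     // optional bias
    }
    cur = ggml_gelu(ctx0, cur);                      // activation is an assumption
    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);     // second linear layer
    if (model.mm_1_b) {
        cur = ggml_add(ctx0, cur, model.mm_1_b);     // optional bias
    }
    return ggml_mul_mat(ctx0, model.mm_model_proj, cur); // final projection
}
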
@@ -2615,6 +2638,8 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_YOUTUVL:
    return (img->nx / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
    return img->nx / (params.patch_size * params.n_merge);
default:
    break;
}
@@ -2632,6 +2657,8 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_YOUTUVL:
    return (img->ny / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
    return img->ny / (params.patch_size * params.n_merge);
default:
    break;
}
@@ -2702,6 +2729,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        int y_patch = img->ny / (params.patch_size * 2);
        n_patches = x_patch * y_patch;
    } break;
case PROJECTOR_TYPE_STEP3VL:
    {
        int x_patch = img->nx / (params.patch_size * params.n_merge);
        int y_patch = img->ny / (params.patch_size * params.n_merge);
        n_patches = x_patch * y_patch;
    } break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA4V:
case PROJECTOR_TYPE_IDEFICS3:
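
Both axes use the same integer division, so token counts floor; preprocessing presumably snaps sides to multiples of patch_size * n_merge so nothing is truncated. A worked example under the same assumed patch size of 14 (one token per 56 px):

// Sketch: per-image token grid under the STEP3VL formula above.
#include <cstdio>

int main() {
    const int patch_size = 14, n_merge = 4;  // patch_size assumed
    const int nx = 1008, ny = 672;           // example preprocessed image

    const int x_patch = nx / (patch_size * n_merge); // 1008 / 56 = 18
    const int y_patch = ny / (patch_size * n_merge); //  672 / 56 = 12
    printf("n_patches = %d\n", x_patch * y_patch);   // 216 image tokens
    return 0;
}
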
@@ -3004,6 +3037,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        set_input_i32("positions", positions);
    } break;
case PROJECTOR_TYPE_STEP3VL:
    {
        std::vector<int32_t> pos_data(n_pos);
        for (int i = 0; i < n_pos; i++) {
            pos_data[i] = i / pos_w;
        }
        set_input_i32("pos_h", pos_data);
        for (int i = 0; i < n_pos; i++) {
            pos_data[i] = i % pos_w;
        }
        set_input_i32("pos_w", pos_data);
    } break;
case PROJECTOR_TYPE_PADDLEOCR:
    {
        const int merge_ratio = hparams.n_merge;
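
The two loops decompose the flat, row-major patch index into (row, column) coordinates for the 2D positions; a toy illustration of the same decomposition:

// Toy illustration of the pos_h / pos_w inputs filled above.
#include <cstdio>

int main() {
    const int pos_w = 3, n_pos = 6; // a 2x3 patch grid
    for (int i = 0; i < n_pos; i++) {
        // i=0..5 -> (0,0) (0,1) (0,2) (1,0) (1,1) (1,2)
        printf("i=%d -> pos_h=%d, pos_w=%d\n", i, i / pos_w, i % pos_w);
    }
    return 0;
}
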
@@ -3358,6 +3403,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_QWEN3VL:
    // main path + deepstack paths
    return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_STEP3VL:
    return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
    return ctx->model.mm_input_proj_w->ne[0];
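
For reference: a ggml matrix used as the left operand of ggml_mul_mat stores its input features in ne[0] and its output features in ne[1], so mm_model_proj->ne[1] is the projector's output width, which must match the language model's embedding size.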