model : support step3-vl-10b (#21287)

* feat: support step3-vl-10b

* use fused QKV and do the tensor mapping in tensor_mapping.py

* guard hardcoded params and drop crop metadata

* get understand_projector_stride from global config

* move img_u8_resize_bilinear_to_f32 into the step3vl class

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix the \r\n mess

* add width and heads to MmprojModel.set_gguf_parameters

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
forforever73
2026-04-08 15:51:31 +08:00
committed by GitHub
parent 97508acb17
commit 09343c0198
12 changed files with 537 additions and 4 deletions
+18 -1
View File
@@ -88,6 +88,7 @@ enum mtmd_slice_tmpl {
MTMD_SLICE_TMPL_LLAMA4,
MTMD_SLICE_TMPL_IDEFICS3,
MTMD_SLICE_TMPL_LFM2,
MTMD_SLICE_TMPL_STEP3VL,
};
const char * mtmd_default_marker() {
@@ -259,7 +260,6 @@ struct mtmd_context {
tok_row_end = {lookup_token("\n")};
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
// minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
@@ -331,6 +331,22 @@ struct mtmd_context {
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
} break;
case PROJECTOR_TYPE_STEP3VL:
{
// Step3 format:
// <patch_start> (patch) <patch_end> [<patch_newline>]
// ... (all patch rows)
// <im_start> (overview) <im_end>
slice_tmpl = MTMD_SLICE_TMPL_STEP3VL;
tok_ov_img_start = {lookup_token("<im_start>")};
tok_ov_img_end = {lookup_token("<im_end>")};
tok_sli_img_start = {lookup_token("<patch_start>")};
tok_sli_img_end = {lookup_token("<patch_end>")};
tok_row_end = {lookup_token("<patch_newline>")};
tok_row_end_trail = false;
ov_img_first = false; // patches first, overview last
image_preproc = std::make_unique<mtmd_image_preprocessor_step3vl>(ctx_v);
} break;
case PROJECTOR_TYPE_INTERNVL:
{
// <img> ... (image embeddings) ... </img>
@@ -682,6 +698,7 @@ struct mtmd_tokenizer {
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
) {
const int n_col = batch_f32.grid_x;