model : support step3-vl-10b (#21287)
* feat: support step3-vl-10b * use fused QKV && mapping tensor in tensor_mapping.py * guard hardcoded params and drop crop metadata * get understand_projector_stride from global config * img_u8_resize_bilinear_to_f32 move in step3vl class * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * fix the \r\n mess * add width and heads to MmprojModel.set_gguf_parameters --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
+18
-1
@@ -88,6 +88,7 @@ enum mtmd_slice_tmpl {
|
||||
MTMD_SLICE_TMPL_LLAMA4,
|
||||
MTMD_SLICE_TMPL_IDEFICS3,
|
||||
MTMD_SLICE_TMPL_LFM2,
|
||||
MTMD_SLICE_TMPL_STEP3VL,
|
||||
};
|
||||
|
||||
const char * mtmd_default_marker() {
|
||||
@@ -259,7 +260,6 @@ struct mtmd_context {
|
||||
tok_row_end = {lookup_token("\n")};
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
@@ -331,6 +331,22 @@ struct mtmd_context {
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_STEP3VL:
|
||||
{
|
||||
// Step3 format:
|
||||
// <patch_start> (patch) <patch_end> [<patch_newline>]
|
||||
// ... (all patch rows)
|
||||
// <im_start> (overview) <im_end>
|
||||
slice_tmpl = MTMD_SLICE_TMPL_STEP3VL;
|
||||
tok_ov_img_start = {lookup_token("<im_start>")};
|
||||
tok_ov_img_end = {lookup_token("<im_end>")};
|
||||
tok_sli_img_start = {lookup_token("<patch_start>")};
|
||||
tok_sli_img_end = {lookup_token("<patch_end>")};
|
||||
tok_row_end = {lookup_token("<patch_newline>")};
|
||||
tok_row_end_trail = false;
|
||||
ov_img_first = false; // patches first, overview last
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_step3vl>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
@@ -682,6 +698,7 @@ struct mtmd_tokenizer {
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL
|
||||
|| (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid)
|
||||
) {
|
||||
const int n_col = batch_f32.grid_x;
|
||||
|
||||
Reference in New Issue
Block a user