model : support step3-vl-10b (#21287)

* feat: support step3-vl-10b * use fused QKV && mapping tensor in tensor_mapping.py * guard hardcoded params and drop crop metadata * get understand_projector_stride from global config * img_u8_resize_bilinear_to_f32 move in step3vl class * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * fix the \r\n mess * add width and heads to MmprojModel.set_gguf_parameters --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-08 15:51:31 +08:00
parent 97508acb17
commit 09343c0198
12 changed files with 537 additions and 4 deletions
@@ -1114,6 +1114,260 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
    return true;
 }

+//
+// mtmd_image_preprocessor_step3vl
+//
+
+void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32(
+        const clip_image_u8 & src,
+        clip_image_f32 & dst,
+        int target_width,
+        int target_height,
+        const float mean[3],
+        const float std[3]) {
+    if (src.nx == target_width && src.ny == target_height) {
+        img_u8_to_f32(src, dst, mean, std);
+        return;
+    }
+
+    dst.nx = target_width;
+    dst.ny = target_height;
+    dst.buf.resize(3 * target_width * target_height);
+
+    const float scale_x = static_cast<float>(src.nx) / target_width;
+    const float scale_y = static_cast<float>(src.ny) / target_height;
+
+    for (int y = 0; y < target_height; ++y) {
+        const float src_y = (static_cast<float>(y) + 0.5f) * scale_y - 0.5f;
+        const int y0_floor = static_cast<int>(std::floor(src_y));
+        const int y0 = std::max(0, std::min(y0_floor, src.ny - 1));
+        const int y1 = std::max(0, std::min(y0_floor + 1, src.ny - 1));
+        const float ly = src_y - y0_floor;
+
+        for (int x = 0; x < target_width; ++x) {
+            const float src_x = (static_cast<float>(x) + 0.5f) * scale_x - 0.5f;
+            const int x0_floor = static_cast<int>(std::floor(src_x));
+            const int x0 = std::max(0, std::min(x0_floor, src.nx - 1));
+            const int x1 = std::max(0, std::min(x0_floor + 1, src.nx - 1));
+            const float lx = src_x - x0_floor;
+
+            const size_t idx00 = 3 * (y0 * src.nx + x0);
+            const size_t idx01 = 3 * (y0 * src.nx + x1);
+            const size_t idx10 = 3 * (y1 * src.nx + x0);
+            const size_t idx11 = 3 * (y1 * src.nx + x1);
+            const size_t idx_dst = 3 * (y * target_width + x);
+
+            for (int c = 0; c < 3; ++c) {
+                const float v00 = (static_cast<float>(src.buf[idx00 + c]) / 255.0f - mean[c]) / std[c];
+                const float v01 = (static_cast<float>(src.buf[idx01 + c]) / 255.0f - mean[c]) / std[c];
+                const float v10 = (static_cast<float>(src.buf[idx10 + c]) / 255.0f - mean[c]) / std[c];
+                const float v11 = (static_cast<float>(src.buf[idx11 + c]) / 255.0f - mean[c]) / std[c];
+
+                const float top = v00 + (v01 - v00) * lx;
+                const float bot = v10 + (v11 - v10) * lx;
+                dst.buf[idx_dst + c] = top + (bot - top) * ly;
+            }
+        }
+    }
+}
+
+int mtmd_image_preprocessor_step3vl::get_image_longest_edge(const clip_hparams & params) {
+    return params.image_longest_edge > 0 ? params.image_longest_edge : default_image_longest_edge;
+}
+
+int mtmd_image_preprocessor_step3vl::determine_window_size(const clip_hparams & params, int longer, int shorter) {
+    const int image_size = params.image_size;
+    const int crop_size  = default_image_crop_size;
+    const float aspect_ratio = static_cast<float>(longer) / shorter;
+
+    if (longer <= image_size) {
+        return aspect_ratio > small_aspect_ratio_limit ? shorter : 0;
+    }
+
+    return aspect_ratio > wide_aspect_ratio_limit ? std::min(shorter, crop_size) : crop_size;
+}
+
+int mtmd_image_preprocessor_step3vl::calc_crop_extent(int length, int window_size) {
+    const float ratio = static_cast<float>(length) / window_size;
+    if (ratio < 1.0f) {
+        return length;
+    }
+
+    const float decimal = ratio - std::floor(ratio);
+    const int rounded = decimal > crop_rounding_threshold
+        ? static_cast<int>(std::floor(ratio)) + 1
+        : static_cast<int>(std::floor(ratio));
+    return window_size * rounded;
+}
+
+std::vector<int> mtmd_image_preprocessor_step3vl::calc_grid(int length, int window_size) {
+    const int n = length <= window_size
+        ? 1
+        : static_cast<int>(std::ceil(static_cast<float>(length - window_size) / window_size + 1.0f));
+    std::vector<int> starts(n);
+
+    for (int i = 0; i < n; ++i) {
+        starts[i] = window_size * i;
+    }
+
+    if (n > 1 && starts.back() + window_size > length) {
+        starts.back() = length - window_size;
+    }
+
+    return starts;
+}
+
+clip_image_u8 mtmd_image_preprocessor_step3vl::prepare_image(const clip_image_u8 & img, const clip_hparams & params) {
+    clip_image_u8 resized = img;
+    const float aspect_ratio = img.ny > 0 ? static_cast<float>(img.nx) / img.ny : 1.0f;
+    if (std::min(img.nx, img.ny) < 32 &&
+        (aspect_ratio > wide_aspect_ratio_limit ||
+         aspect_ratio < 1.0f / wide_aspect_ratio_limit)) {
+        const int square_size = std::max(img.nx, img.ny);
+        clip_image_u8 padded;
+        padded.nx = square_size;
+        padded.ny = square_size;
+        padded.buf.resize(3 * square_size * square_size);
+        img_tool::fill(padded, {0, 0, 0});
+        img_tool::composite(padded, img, 0, 0);
+        resized = std::move(padded);
+    }
+
+    const int max_image_size = get_image_longest_edge(params);
+    if (std::max(resized.nx, resized.ny) > max_image_size) {
+        const float scale = static_cast<float>(max_image_size) / std::max(resized.nx, resized.ny);
+        const clip_image_size new_size = {
+            std::max(1, static_cast<int>(std::floor(resized.nx * scale))),
+            std::max(1, static_cast<int>(std::floor(resized.ny * scale))),
+        };
+        clip_image_u8 scaled;
+        img_tool::resize(resized, scaled, new_size, RESIZE_ALGO_BILINEAR, false);
+        resized = std::move(scaled);
+    }
+
+    return resized;
+}
+
+clip_image_u8 mtmd_image_preprocessor_step3vl::crop_with_black_padding(const clip_image_u8 & image, int x, int y, int w, int h) {
+    clip_image_u8 dst;
+    dst.nx = w;
+    dst.ny = h;
+    dst.buf.resize(3 * w * h, 0);
+
+    const int src_x0 = std::max(0, x);
+    const int src_y0 = std::max(0, y);
+    const int src_x1 = std::min(image.nx, x + w);
+    const int src_y1 = std::min(image.ny, y + h);
+
+    if (src_x0 >= src_x1 || src_y0 >= src_y1) {
+        return dst;
+    }
+
+    const int dst_x0 = src_x0 - x;
+    const int dst_y0 = src_y0 - y;
+
+    for (int yy = 0; yy < src_y1 - src_y0; ++yy) {
+        for (int xx = 0; xx < src_x1 - src_x0; ++xx) {
+            const int src_idx = 3 * ((src_y0 + yy) * image.nx + (src_x0 + xx));
+            const int dst_idx = 3 * ((dst_y0 + yy) * w + (dst_x0 + xx));
+            dst.buf[dst_idx + 0] = image.buf[src_idx + 0];
+            dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
+            dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+        }
+    }
+
+    return dst;
+}
+
+mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step3vl::build_slice_instructions(
+        const clip_hparams & params,
+        const clip_image_size & prepared_size) {
+    slice_instructions instructions;
+    instructions.overview_size = prepared_size;
+
+    const int window_size = determine_window_size(
+        params,
+        std::max(prepared_size.width, prepared_size.height),
+        std::min(prepared_size.width, prepared_size.height));
+    if (window_size <= 0) {
+        instructions.refined_size = clip_image_size{0, 0};
+        instructions.grid_size    = clip_image_size{0, 0};
+        return instructions;
+    }
+
+    const int crop_width  = calc_crop_extent(prepared_size.width,  window_size);
+    const int crop_height = calc_crop_extent(prepared_size.height, window_size);
+    instructions.refined_size = clip_image_size{crop_width, crop_height};
+
+    const auto xs = calc_grid(crop_width,  window_size);
+    const auto ys = calc_grid(crop_height, window_size);
+    instructions.grid_size = clip_image_size{
+        static_cast<int>(xs.size()),
+        static_cast<int>(ys.size()),
+    };
+
+    for (int y : ys) {
+        for (int x : xs) {
+            instructions.slices.push_back(slice_coordinates{
+                /* x    */ x,
+                /* y    */ y,
+                /* size */ clip_image_size{window_size, window_size},
+            });
+        }
+    }
+
+    return instructions;
+}
+
+bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    clip_image_u8 prepared = prepare_image(img, hparams);
+    const auto instructions = build_slice_instructions(hparams, {prepared.nx, prepared.ny});
+
+    clip_image_f32_ptr overview_f32(clip_image_f32_init());
+    img_u8_resize_bilinear_to_f32(
+        prepared,
+        *overview_f32,
+        hparams.image_size,
+        hparams.image_size,
+        hparams.image_mean,
+        hparams.image_std);
+    output.entries.push_back(std::move(overview_f32));
+
+    if (instructions.slices.empty()) {
+        output.grid_x = 0;
+        output.grid_y = 0;
+        return true;
+    }
+
+    clip_image_u8 img_for_crop = prepared;
+    if (instructions.refined_size.width != prepared.nx || instructions.refined_size.height != prepared.ny) {
+        clip_image_u8 refined;
+        img_tool::resize(prepared, refined, instructions.refined_size, RESIZE_ALGO_BILINEAR, false);
+        img_for_crop = std::move(refined);
+    }
+
+    const int crop_size = default_image_crop_size;
+    for (const auto & slice : instructions.slices) {
+        // If the requested patch extends past the source image, pad the out-of-bounds area with black.
+        clip_image_u8 patch = crop_with_black_padding(img_for_crop, slice.x, slice.y, slice.size.width, slice.size.height);
+
+        clip_image_f32_ptr patch_f32(clip_image_f32_init());
+        img_u8_resize_bilinear_to_f32(
+            patch,
+            *patch_f32,
+            crop_size,
+            crop_size,
+            hparams.image_mean,
+            hparams.image_std);
+        output.entries.push_back(std::move(patch_f32));
+    }
+
+    output.grid_x = instructions.grid_size.width;
+    output.grid_y = instructions.grid_size.height;
+
+    return true;
+}
+
 //
 // mtmd_image_preprocessor_youtuvl
 //