mtmd: refactor image preprocessing (#21031)

* mtmd: refactor image pre-processing
* correct some places
* correct lfm2
* fix deepseek-ocr on server
* add comment to clarify about mtmd_image_preprocessor_dyn_size
2026-03-26 19:49:20 +01:00
parent ded446b34c
commit a73bbd5d92
8 changed files with 1602 additions and 1539 deletions
@@ -28,6 +28,13 @@ enum patch_merge_type {
    PATCH_MERGE_SPATIAL_UNPAD,
 };

+enum resize_algo { // resize method selectable per model via the preprocessor fields of clip_hparams
+    RESIZE_ALGO_BILINEAR, // stretch to target resolution
+    RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match
+    RESIZE_ALGO_BICUBIC_PILLOW, // NOTE(review): presumably bicubic resampling matching Pillow (PIL) output — confirm
+    // RESIZE_ALGO_LANCZOS, // TODO
+};
+
 struct clip_hparams {
    int32_t image_size = 0;
    int32_t patch_size = 0;
@@ -37,13 +44,26 @@ struct clip_hparams {
    int32_t n_head = 0;
    int32_t n_layer = 0;
    // idefics3
+    int32_t n_merge = 0; // number of patch merges **per-side**
+
+    // for preprocessor
    int32_t image_longest_edge = 0;
    int32_t image_min_pixels = -1;
    int32_t image_max_pixels = -1;
-    int32_t n_merge = 0; // number of patch merges **per-side**
+    resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
+    bool image_resize_pad = true; // if false, center-crop will be applied when resizing
+    std::array<uint8_t, 3> image_pad_color = {0, 0, 0};

+    // (preprocessor) for llava-uhd style models
+    std::vector<clip_image_size> image_res_candidates;
    int32_t preproc_min_tiles = 0;
    int32_t preproc_max_tiles = 0;
+    resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
+    resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
+    bool image_pad_rf = true;  // if true, refined image will be padded (e.g. llava-1.6)
+    bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
+    std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
+    std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image

    float image_mean[3];
    float image_std[3];
@@ -60,8 +80,6 @@ struct clip_hparams {
    float eps = 1e-6;
    float rope_theta = 0.0;

-    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
-    int32_t image_crop_resolution;
    std::unordered_set<int32_t> vision_feature_layer;
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;