mtmd: Add dynamic high-resolution image preprocessing for InternVL model (#20847)

* added support for internvl's dynamic high-resolution (Qianfan-OCR needed)

* add min/max dynamic patch to gguf meta

* clean up

* simplified handling min/max dynamic patch

* reuse llava_uhd logic for slice images

* provide default values for older models

* flake8

* prevent writing 0 value to gguf

* remove duplicated resolution candidates with a better algorithm

* fix indentation

* format

* add protection from divide by zero

* change to 0 to be safe

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
bssrdf
2026-03-22 20:06:30 -04:00
committed by GitHub
parent d3ac030a5d
commit ec2b787ebe
7 changed files with 84 additions and 4 deletions
+53 -3
View File
@@ -1138,6 +1138,16 @@ struct clip_model_loader {
}
} break;
case PROJECTOR_TYPE_INTERNVL:
{
// older version of internvl doesn't have min/max tiles, we need to provide default values for them to avoid issues
hparams.preproc_min_tiles = 1;
hparams.preproc_max_tiles = 12;
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
get_u32(KEY_PREPROC_MIN_TILES, hparams.preproc_min_tiles, false);
get_u32(KEY_PREPROC_MAX_TILES, hparams.preproc_max_tiles, false);
GGML_ASSERT(hparams.preproc_min_tiles <= hparams.preproc_max_tiles && hparams.preproc_max_tiles < INT32_MAX);
set_internvl_dhr_res_candidates(model);
} break;
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
@@ -2188,6 +2198,27 @@ struct clip_model_loader {
}
}
}
static void set_internvl_dhr_res_candidates(clip_model & model) {
auto & hparams = model.hparams;
int min_num = hparams.preproc_min_tiles;
int max_num = hparams.preproc_max_tiles;
if (min_num < 1) {
return; // avoid divide by 0
}
for (int a = min_num; a <= max_num; ++a) {
int b_lo = (min_num + a - 1) / a;
int b_hi = max_num / a;
b_lo = std::max(b_lo, min_num);
b_hi = std::min(b_hi, max_num);
for (int b = b_lo; b <= b_hi; ++b) {
hparams.image_res_candidates.push_back(clip_image_size {
a*hparams.image_size,
b*hparams.image_size,
});
}
}
}
};
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
@@ -2734,17 +2765,22 @@ struct llava_uhd {
return res;
}
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst, bool overview_first = true) {
std::vector<clip_image_u8_ptr> output;
// resize to overview size
clip_image_u8_ptr resized_img(clip_image_u8_init());
img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
inst.padding_overview, inst.pad_color_overview);
output.push_back(std::move(resized_img));
if (overview_first) {
output.push_back(std::move(resized_img));
}
if (inst.slices.empty()) {
// no slices, just return the resized image
if (!overview_first) {
output.push_back(std::move(resized_img));
}
return output;
}
@@ -2765,6 +2801,10 @@ struct llava_uhd {
output.push_back(std::move(img_slice));
}
if (!overview_first) {
output.push_back(std::move(resized_img));
}
return output;
}
@@ -3149,10 +3189,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
res_imgs->grid_x = instructions.grid_size.width;
res_imgs->grid_y = instructions.grid_size.height;
} break;
case PROJECTOR_TYPE_INTERNVL: // support dynamic high-resolution
{
GGML_ASSERT(!params.image_res_candidates.empty());
auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst, false);
for (size_t i = 0; i < imgs.size(); ++i) {
clip_image_f32_ptr res(clip_image_f32_init());
normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
res_imgs->entries.push_back(std::move(res));
}
} break;
case PROJECTOR_TYPE_GLM_EDGE:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
clip_image_u8 resized_image;