mtmd: Add dynamic high-resolution image preprocessing for InternVL model (#20847)

* add support for InternVL's dynamic high-resolution preprocessing (needed for Qianfan-OCR)

* add min/max dynamic patch values to the GGUF metadata

* clean up

* simplified handling min/max dynamic patch

* reuse llava_uhd logic for slicing images

* provide default values for older models

* flake8

* prevent writing 0 value to gguf

* remove duplicated resolution candidates with a better algorithm

* fix indentation

* format

* add protection from divide by zero

* change to 0 to be safe

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
bssrdf
2026-03-22 20:06:30 -04:00
committed by GitHub
parent d3ac030a5d
commit ec2b787ebe
7 changed files with 84 additions and 4 deletions
+2
View File
@@ -301,6 +301,8 @@ class Keys:
IMAGE_SIZE = "clip.vision.image_size"
IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels"
IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels"
PREPROC_MIN_TILES = "clip.vision.preproc_min_tiles"
PREPROC_MAX_TILES = "clip.vision.preproc_max_tiles"
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
PATCH_SIZE = "clip.vision.patch_size"
EMBEDDING_LENGTH = "clip.vision.embedding_length"
+6
View File
@@ -1156,6 +1156,12 @@ class GGUFWriter:
def add_vision_min_pixels(self, value: int) -> None:
    """Store ``value`` under the ``Keys.ClipVision.IMAGE_MIN_PIXELS`` key.

    The value is handed to ``self.add_uint32`` unchanged; no validation
    is performed here.
    """
    key = Keys.ClipVision.IMAGE_MIN_PIXELS
    self.add_uint32(key, value)
def add_vision_preproc_max_tiles(self, value: int) -> None:
    """Store ``value`` under the ``Keys.ClipVision.PREPROC_MAX_TILES`` key.

    The value is handed to ``self.add_uint32`` unchanged; no validation
    is performed here.
    """
    key = Keys.ClipVision.PREPROC_MAX_TILES
    self.add_uint32(key, value)
def add_vision_preproc_min_tiles(self, value: int) -> None:
    """Store ``value`` under the ``Keys.ClipVision.PREPROC_MIN_TILES`` key.

    The value is handed to ``self.add_uint32`` unchanged; no validation
    is performed here.
    """
    key = Keys.ClipVision.PREPROC_MIN_TILES
    self.add_uint32(key, value)
def add_vision_preproc_image_size(self, value: int) -> None:
    """Store ``value`` under the ``Keys.ClipVision.PREPROC_IMAGE_SIZE`` key.

    The value is handed to ``self.add_uint32`` unchanged; no validation
    is performed here.
    """
    key = Keys.ClipVision.PREPROC_IMAGE_SIZE
    self.add_uint32(key, value)