mtmd: Add dynamic high-resolution image preprocessing for InternVL model (#20847)
* Added support for InternVL's dynamic high-resolution preprocessing (needed for Qianfan-OCR)
* Add min/max dynamic patch to GGUF metadata
* Clean up
* Simplified handling of min/max dynamic patch
* Reuse llava_uhd logic for slicing images
* Provide default values for older models
* flake8
* Prevent writing a 0 value to GGUF
* Remove duplicated resolution candidates with a better algorithm
* Fix indentation
* Format
* Add protection from divide by zero
* Change to 0 to be safe

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
@@ -301,6 +301,8 @@ class Keys:
|
||||
# Metadata key strings; every key in this run lives under the "clip.vision." prefix.
IMAGE_SIZE = "clip.vision.image_size"
|
||||
# Referenced by add_vision_min_pixels, which stores its value through add_uint32.
IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels"
|
||||
IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels"
|
||||
# Referenced by add_vision_preproc_min_tiles, which stores its value through add_uint32.
PREPROC_MIN_TILES = "clip.vision.preproc_min_tiles"
|
||||
# Referenced by add_vision_preproc_max_tiles, which stores its value through add_uint32.
PREPROC_MAX_TILES = "clip.vision.preproc_max_tiles"
|
||||
# Referenced by add_vision_preproc_image_size, which stores its value through add_uint32.
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
|
||||
PATCH_SIZE = "clip.vision.patch_size"
|
||||
EMBEDDING_LENGTH = "clip.vision.embedding_length"
|
||||
|
||||
@@ -1156,6 +1156,12 @@ class GGUFWriter:
|
||||
def add_vision_min_pixels(self, value: int) -> None:
    """Record ``value`` under ``Keys.ClipVision.IMAGE_MIN_PIXELS``.

    The value is handed to ``self.add_uint32`` together with the key.
    """
    metadata_key = Keys.ClipVision.IMAGE_MIN_PIXELS
    self.add_uint32(metadata_key, value)
|
||||
|
||||
def add_vision_preproc_max_tiles(self, value: int) -> None:
    """Record ``value`` under ``Keys.ClipVision.PREPROC_MAX_TILES``.

    The value is handed to ``self.add_uint32`` together with the key.
    """
    metadata_key = Keys.ClipVision.PREPROC_MAX_TILES
    self.add_uint32(metadata_key, value)
|
||||
|
||||
def add_vision_preproc_min_tiles(self, value: int) -> None:
    """Record ``value`` under ``Keys.ClipVision.PREPROC_MIN_TILES``.

    The value is handed to ``self.add_uint32`` together with the key.
    """
    metadata_key = Keys.ClipVision.PREPROC_MIN_TILES
    self.add_uint32(metadata_key, value)
|
||||
|
||||
def add_vision_preproc_image_size(self, value: int) -> None:
    """Record ``value`` under ``Keys.ClipVision.PREPROC_IMAGE_SIZE``.

    The value is handed to ``self.add_uint32`` together with the key.
    """
    metadata_key = Keys.ClipVision.PREPROC_IMAGE_SIZE
    self.add_uint32(metadata_key, value)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user