mtmd: support dots.ocr (#17575)
* convert gguf * clip impl * fix conversion * wip * corrections * update docs * add gguf to test script
This commit is contained in:
+41
-2
@@ -3777,7 +3777,14 @@ class QwenModel(TextModel):
|
||||
self._set_vocab_qwen()
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
|
||||
@ModelBase.register(
|
||||
"Qwen2Model",
|
||||
"Qwen2ForCausalLM",
|
||||
"Qwen2AudioForConditionalGeneration",
|
||||
"KORMoForCausalLM",
|
||||
"AudioFlamingo3ForConditionalGeneration",
|
||||
"DotsOCRForCausalLM",
|
||||
)
|
||||
class Qwen2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2
|
||||
|
||||
@@ -3798,7 +3805,8 @@ class Qwen2Model(TextModel):
|
||||
name = name.replace("language_model.", "") # for InternVL
|
||||
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
|
||||
or name.startswith("vision_model") or name.startswith("audio_tower") \
|
||||
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
|
||||
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
|
||||
or name.startswith("vision_tower."):
|
||||
# skip vision and audio tensors
|
||||
return
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
@@ -12819,6 +12827,37 @@ class SolarOpenModel(Glm4MoeModel):
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
|
||||
@ModelBase.register("DotsOCRForCausalLM")
|
||||
class DotsOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
assert self.hparams_vision is not None
|
||||
self.hparams_vision["image_size"] = 0 # dynamic resolution
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
|
||||
self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
|
||||
self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"]))
|
||||
self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"]))
|
||||
self.gguf_writer.add_vision_use_silu(True)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
if name.startswith("vision_tower."):
|
||||
if "vision_tower.blocks." in name and ".mlp." in name:
|
||||
# note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here
|
||||
# x = F.silu(self.fc1(x)) * self.fc3(x)
|
||||
# x = self.fc2(x)
|
||||
# fc1 -> gate, fc2 -> down, fc3 -> up
|
||||
# mapping original names to Qwen2.5 naming scheme
|
||||
name = name.replace("vision_tower.blocks.", "visual.blocks.")
|
||||
name = name.replace(".fc1", ".gate_proj")
|
||||
name = name.replace(".fc2", ".down_proj")
|
||||
name = name.replace(".fc3", ".up_proj")
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user