model: Add PaddleOCR-VL model support (#18825)
* support PaddleOCR-VL * clip: update PaddleOCR model loader parameters to prevent OOM during warmup * [update] add paddleocr vl text model instead of ernie4.5 * [update] restore change of minicpmv * [update] format * [update] format * [update] positions and patch merge permute * [update] mtmd_decode_use_mrope for paddleocr * [update] image min/max pixels * [update] remove set_limit_image_tokens * update: preprocess without padding * clean up * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
@@ -3733,6 +3733,13 @@ class Ernie4_5Model(TextModel):
|
||||
def set_vocab(self):
    """Set the SentencePiece vocab and forward the optional space-prefix flag.

    After the SentencePiece vocabulary has been set, ``tokenizer_config.json``
    in the model directory is read when it exists. If it contains an
    ``add_prefix_space`` key, that value is handed to the GGUF writer;
    otherwise nothing more is written.
    """
    self._set_vocab_sentencepiece()

    config_path = self.dir_model / 'tokenizer_config.json'
    if not config_path.is_file():
        # No tokenizer config shipped with the model: nothing more to record.
        return

    with open(config_path, "r", encoding="utf-8") as fp:
        tokenizer_cfg = json.load(fp)

    if "add_prefix_space" not in tokenizer_cfg:
        return

    self.gguf_writer.add_add_space_prefix(tokenizer_cfg["add_prefix_space"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
@@ -3742,6 +3749,10 @@ class Ernie4_5Model(TextModel):
|
||||
if (head_dim := self.hparams.get("head_dim")) is None:
|
||||
head_dim = self.hparams["hidden_size"] // num_heads
|
||||
|
||||
if "mlp_AR" in name or "vision_model" in name:
|
||||
# skip vision model and projector tensors
|
||||
return
|
||||
|
||||
if "ernie." in name:
|
||||
name = name.replace("ernie.", "model.")
|
||||
# split the qkv weights
|
||||
@@ -3851,6 +3862,48 @@ class Ernie4_5MoeModel(Ernie4_5Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@ModelBase.register("PaddleOCRVLForConditionalGeneration")
class PaddleOCRModel(Ernie4_5Model):
    """Converter registered for ``PaddleOCRVLForConditionalGeneration``.

    Everything is inherited from ``Ernie4_5Model``; this subclass only
    overrides the GGUF architecture the model is written as.
    """

    model_arch = gguf.MODEL_ARCH.PADDLEOCR
|
||||
|
||||
|
||||
@ModelBase.register("PaddleOCRVisionModel")
class PaddleOCRVisionModel(MmprojModel):
    """Multimodal-projector converter registered for ``PaddleOCRVisionModel``.

    PaddleOCR-VL uses a modified version of Siglip.
    """

    # Pixel-count bounds; overwritten in __init__ from the preprocessor config.
    min_pixels: int = 0
    max_pixels: int = 0

    def __init__(self, *args, **kwargs):
        """Read the pixel bounds and derive ``image_size`` from ``max_pixels``."""
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None

        preproc = self.preprocessor_config
        self.min_pixels = preproc["min_pixels"]
        self.max_pixels = preproc["max_pixels"]

        # image_size is set to the integer square root of the max pixel budget.
        side = int(math.sqrt(self.max_pixels))
        self.hparams_vision["image_size"] = side

    def set_gguf_parameters(self):
        """Write the base parameters plus the PaddleOCR-specific vision keys."""
        super().set_gguf_parameters()
        assert self.hparams_vision is not None
        vision_cfg = self.hparams_vision
        writer = self.gguf_writer

        writer.add_clip_projector_type(gguf.VisionProjectorType.PADDLEOCR)
        writer.add_vision_max_pixels(self.max_pixels)
        writer.add_vision_min_pixels(self.min_pixels)
        writer.add_vision_use_gelu(True)

        # Falls back to 1e-6 when the vision hparams carry no "rms_norm_eps".
        norm_eps = vision_cfg.get("rms_norm_eps", 1e-6)
        writer.add_vision_attention_layernorm_eps(norm_eps)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the converted vision/projector tensors and drop everything else.

        ``visual.`` in the tensor name is rewritten to ``model.`` first. Only
        names containing ``vision_model`` or ``mlp_AR`` are forwarded to the
        parent implementation; the packing position embedding and the vision
        head are dropped.
        """
        name = name.replace("visual.", "model.")

        if "vision_model" not in name and "mlp_AR" not in name:
            return  # skip other tensors

        if "packing_position_embedding" in name:
            return  # unused

        if "vision_model.head" in name:
            # we don't yet support image embeddings for this model
            return

        yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register(
|
||||
"Qwen2VLModel",
|
||||
"Qwen2VLForConditionalGeneration",
|
||||
|
||||
Reference in New Issue
Block a user