model : add glm-asr support (#17901)
* [model] add glm-asr support * fix format for ci * fix convert format for ci * update glm_asr convert script & use build_ffn for glm_asr clip & use build_stack for padding and review * check root architecture for convert hf script * fix conficlt with upstream * fix convert script for glm asr & format clip-impl * format * restore hparams text * improved conversion --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
+84
-2
@@ -713,6 +713,9 @@ class ModelBase:
|
||||
if "llm_config" in config:
|
||||
# rename for InternVL
|
||||
config["text_config"] = config["llm_config"]
|
||||
if "lm_config" in config:
|
||||
# rename for GlmASR
|
||||
config["text_config"] = config["lm_config"]
|
||||
if "thinker_config" in config:
|
||||
# rename for Qwen2.5-Omni
|
||||
config["text_config"] = config["thinker_config"]["text_config"]
|
||||
@@ -1529,6 +1532,21 @@ class TextModel(ModelBase):
|
||||
raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
|
||||
self.gguf_writer.add_pooling_type(pooling_type)
|
||||
|
||||
def _set_vocab_glmedge(self):
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
|
||||
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_interns1(self):
|
||||
tokens: list[str] = []
|
||||
toktypes: list[int] = []
|
||||
@@ -1658,7 +1676,7 @@ class MmprojModel(ModelBase):
|
||||
preprocessor_config: dict[str, Any]
|
||||
global_config: dict[str, Any]
|
||||
|
||||
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
|
||||
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
|
||||
|
||||
has_vision_encoder: bool = True # by default
|
||||
has_audio_encoder: bool = False
|
||||
@@ -1734,7 +1752,8 @@ class MmprojModel(ModelBase):
|
||||
return self.global_config.get(config_name)
|
||||
|
||||
def get_audio_config(self) -> dict[str, Any] | None:
|
||||
return self.global_config.get("audio_config")
|
||||
mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
|
||||
return self.global_config.get(mm_config_key)
|
||||
|
||||
def set_type(self):
|
||||
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
|
||||
@@ -2372,8 +2391,13 @@ class LlamaModel(TextModel):
|
||||
# fix for SmolVLM2, missing `num_attention_heads` in config.json
|
||||
if self.hf_arch == "VLlama3ForCausalLM":
|
||||
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
|
||||
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
|
||||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
def set_vocab(self):
|
||||
if self.origin_hf_arch == "GlmasrModel":
|
||||
return self._set_vocab_glmedge()
|
||||
|
||||
if self.is_mistral_format:
|
||||
return self._set_vocab_mistral()
|
||||
|
||||
@@ -2444,6 +2468,7 @@ class LlamaModel(TextModel):
|
||||
"vision_language_adapter.",
|
||||
"patch_merger.",
|
||||
"pre_mm_projector_norm",
|
||||
"audio_encoder.",
|
||||
]
|
||||
|
||||
is_multimodal_tensor = "vision_tower" in name \
|
||||
@@ -8846,6 +8871,63 @@ class UltravoxModel(TextModel):
|
||||
raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")
|
||||
|
||||
|
||||
@ModelBase.register("GlmasrModel")
|
||||
class GlmASRWhisperEncoderModel(MmprojModel):
|
||||
has_vision_encoder = False
|
||||
has_audio_encoder = True
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:
|
||||
self.hparams["hidden_size"] = self.hparams["d_model"]
|
||||
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
|
||||
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
|
||||
self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
|
||||
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
|
||||
self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
||||
if ".conv" in name and ".weight" in name:
|
||||
return gguf.GGMLQuantizationType.F16
|
||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if name.startswith("model.") or name.startswith("lm_head."):
|
||||
# skip language model tensors
|
||||
return []
|
||||
|
||||
if name.startswith("audio_encoder.whisper."):
|
||||
name = name.replace("audio_encoder.whisper.","audio_tower.")
|
||||
if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:
|
||||
name = name.replace("audio_encoder.", "audio_encoder.adapting.")
|
||||
|
||||
if name.startswith("audio_encoder.audio_bos_eos_token."):
|
||||
return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])]
|
||||
|
||||
if name.startswith("audio_encoder.adapting."):
|
||||
name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.")
|
||||
if ".layer_norm." in name:
|
||||
name = name.replace(".layer_norm.", ".ln_pre.")
|
||||
if ".0." in name:
|
||||
name = name.replace(".0.", ".linear_1.")
|
||||
if ".2." in name:
|
||||
name = name.replace(".2.", ".linear_2.")
|
||||
if ".proj." in name:
|
||||
return []
|
||||
|
||||
if "conv1.bias" in name or "conv2.bias" in name:
|
||||
# transpose conv1 and conv2 bias
|
||||
data_torch = data_torch.unsqueeze(-1)
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2AudioForConditionalGeneration")
|
||||
class WhisperEncoderModel(MmprojModel):
|
||||
has_vision_encoder = False # no vision encoder
|
||||
|
||||
Reference in New Issue
Block a user