mtmd : add MERaLiON-2 multimodal audio support (#21756)
* mtmd : add MERaLiON-2 multimodal audio support

Adds support for A*STAR's MERaLiON-2 audio-language models (3B and 10B)
to the multimodal framework.

Architecture:
- Whisper large-v2 encoder for audio feature extraction
- gated MLP adaptor: ln_speech -> frame stack (x15) -> Linear+SiLU -> GLU -> out_proj
- Gemma2 3B / 27B decoder

The mmproj GGUF is generated via convert_hf_to_gguf.py --mmproj on the
full MERaLiON-2 model directory (architecture:
MERaLiON2ForConditionalGeneration). The decoder is converted separately
as a standard Gemma2 model after stripping the "text_decoder." weight
prefix.

New projector type: PROJECTOR_TYPE_MERALION

Supported tasks: speech transcription (EN/ZH/MS/TA), translation, spoken QA.

Models:
https://huggingface.co/MERaLiON/MERaLiON-2-3B
https://huggingface.co/MERaLiON/MERaLiON-2-10B

* simplify comments in meralion adaptor

* meralion: use format_tensor_name, ascii arrows in comments
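For orientation, a minimal PyTorch sketch of the gated MLP adaptor described above. Module names mirror the checkpoint tensor names handled by the converter below (ln_speech, mlp_adapter.0, gate_proj, pool_proj, out_proj); the hidden sizes and the exact placement of the SiLU activations around the GLU are assumptions for illustration, not the reference implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F

class GatedMLPAdaptorSketch(nn.Module):
    # ln_speech -> frame stack (x15) -> Linear+SiLU -> GLU -> out_proj
    def __init__(self, d_enc: int = 1280, d_out: int = 2048, stack_factor: int = 15):
        super().__init__()
        self.stack_factor = stack_factor
        self.ln_speech = nn.LayerNorm(d_enc)                       # pre-norm over encoder output
        self.mlp_adapter = nn.Linear(d_enc * stack_factor, d_out)  # Linear over stacked frames
        self.gate_proj = nn.Linear(d_out, d_out)                   # GLU gate branch
        self.pool_proj = nn.Linear(d_out, d_out)                   # GLU value branch
        self.out_proj = nn.Linear(d_out, d_out)                    # projection into decoder space

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, c = x.shape                                          # [batch, frames, d_enc]
        x = self.ln_speech(x)
        t = t - t % self.stack_factor                              # drop any ragged tail
        x = x[:, :t, :].reshape(b, t // self.stack_factor, c * self.stack_factor)
        x = F.silu(self.mlp_adapter(x))                            # Linear + SiLU
        x = F.silu(self.gate_proj(x)) * self.pool_proj(x)          # GLU
        return self.out_proj(x)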
@@ -11279,6 +11279,48 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
 
+@ModelBase.register("MERaLiON2ForConditionalGeneration")
+class MERaLiONWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("speech_config")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION)
+        self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("text_decoder."):
+            return
+
+        if name.startswith("speech_encoder."):
+            name = name.replace("speech_encoder.", "audio_tower.")
+            yield from super().modify_tensors(data_torch, name, bid)
+            return
+
+        suffix = "." + name.rsplit(".", 1)[-1]
+
+        if name.startswith("ln_speech."):
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch)
+            return
+
+        if name.startswith("speech_audio_adapter."):
+            if ".mlp_adapter.0." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch)
+            elif ".gate_proj." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch)
+            elif ".pool_proj." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch)
+            elif ".out_proj." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("VoxtralForConditionalGeneration")
 class VoxtralWhisperEncoderModel(WhisperEncoderModel):
     has_vision_encoder = False # no vision encoder
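A quick sanity check on the stack factor recorded above, assuming standard Whisper large-v2 framing (10 ms mel hop, stride-2 encoder convolutions); the tokens-per-window figure is an inference for illustration, not a number stated in this patch.

def adaptor_frames(seconds: float, stack_factor: int = 15) -> int:
    mel_frames = int(seconds * 100)     # Whisper mel spectrogram uses a 10 ms hop
    enc_frames = mel_frames // 2        # the encoder's conv stem downsamples 2x
    return enc_frames // stack_factor   # frame stacking in the adaptor

assert adaptor_frames(30.0) == 100      # one 30 s window -> 100 audio embeddings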