mtmd : add MERaLiON-2 multimodal audio support (#21756)

* mtmd : add MERaLiON-2 multimodal audio support Adds support for A*STAR's MERaLiON-2 audio-language model (3B and 10B) to the multimodal framework. Architecture: - Whisper large-v2 encoder for audio feature extraction - Gated MLP adaptor: ln_speech -> frame stack (x15) -> Linear+SiLU -> GLU -> out_proj - Gemma2 3B / 27B decoder The mmproj GGUF is generated via convert_hf_to_gguf.py --mmproj on the full MERaLiON-2 model directory (architecture: MERaLiON2ForConditionalGeneration). The decoder is converted separately as a standard Gemma2 model after stripping the text_decoder. weight prefix. New projector type: PROJECTOR_TYPE_MERALION Supports tasks: speech transcription (EN/ZH/MS/TA), translation, spoken QA. Model: https://huggingface.co/MERaLiON/MERaLiON-2-3B https://huggingface.co/MERaLiON/MERaLiON-2-10B * simplify comments in meralion adaptor * meralion: use format_tensor_name, ascii arrows in comments
2026-04-11 20:15:48 +08:00
parent af1127d3c4
commit 073bb2c20b
8 changed files with 103 additions and 2 deletions
@@ -11279,6 +11279,48 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
        self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


+@ModelBase.register("MERaLiON2ForConditionalGeneration")
+class MERaLiONWhisperEncoderModel(WhisperEncoderModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("speech_config")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION)
+        self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("text_decoder."):
+            return
+
+        if name.startswith("speech_encoder."):
+            name = name.replace("speech_encoder.", "audio_tower.")
+            yield from super().modify_tensors(data_torch, name, bid)
+            return
+
+        suffix = "." + name.rsplit(".", 1)[-1]
+
+        if name.startswith("ln_speech."):
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch)
+            return
+
+        if name.startswith("speech_audio_adapter."):
+            if ".mlp_adapter.0." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 0, suffix=suffix), data_torch)
+            elif ".gate_proj." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 1, suffix=suffix), data_torch)
+            elif ".pool_proj." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 2, suffix=suffix), data_torch)
+            elif ".out_proj." in name:
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, 3, suffix=suffix), data_torch)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
@ModelBase.register("VoxtralForConditionalGeneration")
 class VoxtralWhisperEncoderModel(WhisperEncoderModel):
    has_vision_encoder = False # no vision encoder