diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bf8af863a..90c2b7094 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -728,6 +728,9 @@ class ModelBase:
 
         del experts, merged
 
+    def _needs_nvfp4_processing(self) -> bool:
+        return True
+
     def prepare_tensors(self):
         # detect NVFP4 quantization (ModelOpt format)
         quant_algo = (self.hparams.get("quantization_config") or {}).get("quant_algo")
@@ -758,7 +761,7 @@ class ModelBase:
         # NVFP4 weights are repacked and written directly to gguf_writer.
         # This must run before dequant_model so NVFP4 tensors are removed
         # from model_tensors, leaving only non-NVFP4 (e.g. FP8) for dequant.
-        if self._is_nvfp4:
+        if self._is_nvfp4 and self._needs_nvfp4_processing():
             self._generate_nvfp4_tensors()
 
         self.dequant_model()
@@ -2190,6 +2193,10 @@ class MmprojModel(ModelBase):
             # merge configs
             self.preprocessor_config = {**self.preprocessor_config, **cfg}
 
+    def _needs_nvfp4_processing(self) -> bool:
+        # NVFP4 quantization applies to the text model only.
+        return False
+
     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
         return self.global_config.get(config_name)
@@ -4450,6 +4457,12 @@ class NemotronNanoV2VLModel(MmprojModel):
         }
         return vision_config
 
+    def dequant_model(self):
+        if self._is_nvfp4:
+            # Skip NVFP4 dequantization for the vision/audio model.
+            return
+        super().dequant_model()
+
     def set_gguf_parameters(self):
         if "image_mean" not in self.preprocessor_config:
             self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
@@ -4473,6 +4486,10 @@ class NemotronNanoV2VLModel(MmprojModel):
         if "input_conditioner" in name:
             return
 
+        # mtmd does not support video yet, so skip video-related tensors.
+        if "radio_model.model.patch_generator.video_embedder" in name:
+            return
+
         # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
         if "patch_generator.pos_embed" in name:
             if not name.endswith(".weight"):
@@ -10820,7 +10837,11 @@ class NemotronHModel(GraniteHybridModel):
         # uses self.model_arch to build the tensor name map, and all MoE-specific
         # mappings would be missed if it were called with the default non-MoE arch.
         hparams = ModelBase.load_hparams(args[0], self.is_mistral_format)
-        if "num_experts_per_tok" in hparams:
+        has_moe_params = (
+            "num_experts_per_tok" in hparams
+            or (isinstance(hparams.get("llm_config"), dict) and "num_experts_per_tok" in hparams["llm_config"])
+        )
+        if has_moe_params:
             self.model_arch = gguf.MODEL_ARCH.NEMOTRON_H_MOE
             self.is_moe = True
@@ -10967,6 +10988,11 @@ class NemotronHModel(GraniteHybridModel):
         if name.startswith(("vision_model.", "mlp1.")):
             return
 
+        if name.startswith("sound_encoder."):
+            return
+        if name.startswith("sound_projection."):
+            return
+
         # Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
         if name.startswith("language_model."):
             name = name[len("language_model."):]
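
Notes on the changes:

The new _needs_nvfp4_processing hook keeps the NVFP4 detection and repack logic in ModelBase.prepare_tensors while letting MmprojModel opt out, since NVFP4 quantization applies to the text model only. Below is a minimal, runnable sketch of the pattern; the Toy* classes are hypothetical stand-ins for ModelBase/MmprojModel, and prepare_tensors is reduced to a print:

class ToyModelBase:
    # assume NVFP4 was detected from quantization_config
    _is_nvfp4 = True

    def _needs_nvfp4_processing(self) -> bool:
        # default: text models repack NVFP4 tensors directly
        return True

    def prepare_tensors(self):
        if self._is_nvfp4 and self._needs_nvfp4_processing():
            print("repacking NVFP4 tensors")
        else:
            print("leaving tensors for dequant/other handling")


class ToyMmprojModel(ToyModelBase):
    def _needs_nvfp4_processing(self) -> bool:
        # NVFP4 applies to the text model only; mmproj skips the repack
        return False


ToyModelBase().prepare_tensors()    # -> repacking NVFP4 tensors
ToyMmprojModel().prepare_tensors()  # -> leaving tensors for dequant/other handling

Defaulting the hook to True on the base class means every existing text-model subclass keeps its current behavior; only the mmproj path needs the override.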
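
The widened MoE check matters for VLM checkpoints (e.g. Nemotron Nano 12B v2 VL) whose text-model hyperparameters are nested under llm_config rather than sitting at the top level of config.json. A self-contained sketch of the predicate; the example dicts are hypothetical, the logic mirrors the diff:

def has_moe_params(hparams: dict) -> bool:
    return (
        "num_experts_per_tok" in hparams
        or (isinstance(hparams.get("llm_config"), dict)
            and "num_experts_per_tok" in hparams["llm_config"])
    )

text_moe_cfg = {"num_experts_per_tok": 8}
vlm_moe_cfg = {"llm_config": {"num_experts_per_tok": 8}, "vision_config": {}}
dense_cfg = {"hidden_size": 4096}

assert has_moe_params(text_moe_cfg)
assert has_moe_params(vlm_moe_cfg)
assert not has_moe_params(dense_cfg)

The isinstance guard keeps the check safe when llm_config is absent or is not a dict.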
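
The sound_encoder. / sound_projection. filters extend the existing prefix-based skipping in NemotronHModel.modify_tensors, which already drops vision tensors before stripping the language_model. wrapper prefix. A rough sketch of that filtering; the string-returning convention is illustrative only (the real method yields name/tensor pairs):

SKIP_PREFIXES = ("vision_model.", "mlp1.", "sound_encoder.", "sound_projection.")

def text_tensor_name(name: str) -> str | None:
    # vision/audio tensors are handled elsewhere (or unsupported), so drop them
    if name.startswith(SKIP_PREFIXES):
        return None
    # strip the VLM wrapper prefix so names match the text-model tensor map
    if name.startswith("language_model."):
        name = name[len("language_model."):]
    return name

assert text_tensor_name("sound_encoder.conv1.weight") is None
assert text_tensor_name("language_model.model.embed_tokens.weight") == "model.embed_tokens.weight"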