From d13540becdbccf4829379da67c85be680aa762a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Mon, 27 Apr 2026 08:45:01 +0200
Subject: [PATCH] convert : remove input_scale for dequantized fp8 modelopt
 (#22356)

---
 convert_hf_to_gguf.py | 51 +++++++++++++++++++++------------------------------
 1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 93d5509e6..bf8af863a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -272,6 +272,22 @@ class ModelBase:
 
         return tensors
 
+    @staticmethod
+    def _scale_is_trivial(scale: Tensor) -> bool:
+        return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6
+
+    def _write_scale_tensor(self, scale_name: str, scale: Tensor):
+        if not self._scale_is_trivial(scale):
+            scale_f32 = scale.float().numpy().flatten()
+            logger.info(f" + {scale_name} (per-tensor scale, shape [{scale_f32.size}])")
+            self.gguf_writer.add_tensor(scale_name, scale_f32)
+
+    def _write_scales_tensor(self, scale_name: str, scales: list[float]):
+        if not np.allclose(scales, 1.0, atol=1e-6):
+            scale_vals = np.array(scales, dtype=np.float32)
+            logger.info(f" + {scale_name} (per-expert scale, shape [{len(scales)}])")
+            self.gguf_writer.add_tensor(scale_name, scale_vals)
+
     def dequant_model(self):
         # If all quantized tensors were already handled (e.g. pure NVFP4), skip
         if self._is_nvfp4 and not any(k.endswith((".weight_scale", ".weight_scale_inv")) for k in self.model_tensors):
@@ -494,7 +510,7 @@ class ModelBase:
                     s = self.model_tensors[name]
                     self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s(), None)
                     tensors_to_remove.append(name)
-                if name.endswith((".k_scale", ".v_scale")):
+                if name.endswith((".input_scale", ".k_scale", ".v_scale")):
                     tensors_to_remove.append(name)
             elif quant_method is not None:
                 raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}")
@@ -602,10 +618,6 @@ class ModelBase:
         raw = np.concatenate([d_grouped, qs_grouped], axis=-1).reshape(out_features, n_super * 36)
         return raw, [out_features, n_super * 64]
 
-    @staticmethod
-    def _nvfp4_scale2_is_trivial(scale2: Tensor) -> bool:
-        return scale2.numel() <= 1 and abs(float(scale2.float().sum()) - 1.0) < 1e-6
-
     def _repack_nvfp4(self, name: str, weight: Tensor, scale: Tensor, scale2: Tensor, input_scale: Tensor):
         if "language_model." in name:
             name = name.replace("language_model.", "")
@@ -616,19 +628,8 @@ class ModelBase:
         logger.info(f"Repacked {new_name} with shape {shape} and quantization NVFP4")
         self.gguf_writer.add_tensor(new_name, raw, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
 
-        # Emit per-tensor scale2 as a separate F32 tensor when non-trivial
-        if not self._nvfp4_scale2_is_trivial(scale2):
-            scale2_f32 = scale2.float().numpy().flatten()
-            scale_name = new_name.replace(".weight", ".scale")
-            logger.info(f" + {scale_name} (per-tensor NVFP4 scale2, shape [{scale2_f32.size}])")
-            self.gguf_writer.add_tensor(scale_name, scale2_f32)
-
-        # Emit per-tensor input_scale as a separate F32 tensor when non-trivial
-        if not self._nvfp4_scale2_is_trivial(input_scale):
-            input_scale_f32 = input_scale.float().numpy().flatten()
-            input_scale_name = new_name.replace(".weight", ".input_scale")
-            logger.info(f" + {input_scale_name} (per-tensor NVFP4 input_scale, shape [{input_scale_f32.size}])")
-            self.gguf_writer.add_tensor(input_scale_name, input_scale_f32)
+        self._write_scale_tensor(new_name.replace(".weight", ".scale"), scale2)
+        self._write_scale_tensor(new_name.replace(".weight", ".input_scale"), input_scale)
 
     def _generate_nvfp4_tensors(self):
         # Per-layer expert merging to avoid holding all experts in memory
@@ -719,21 +720,11 @@ class ModelBase:
         logger.info(f"Repacked {new_name} with shape [{len(experts)}, {shape[0]}, {shape[1]}] and quantization NVFP4")
         self.gguf_writer.add_tensor(new_name, merged, raw_dtype=gguf.GGMLQuantizationType.NVFP4)
 
-        # Emit per-expert scale2 tensor if any expert has non-trivial scale2
         scales.sort(key=lambda x: x[0])
-        scale_vals = np.array([s[1] for s in scales], dtype=np.float32)
-        if not np.allclose(scale_vals, 1.0, atol=1e-6):
-            scale_name = new_name.replace(".weight", ".scale")
-            logger.info(f" + {scale_name} (per-expert NVFP4 scale2, shape [{len(scales)}])")
-            self.gguf_writer.add_tensor(scale_name, scale_vals)
+        self._write_scales_tensor(new_name.replace(".weight", ".scale"), [s[1] for s in scales])
 
-        # Emit per-expert input_scale tensor if any expert has non-trivial input_scale
         input_scales.sort(key=lambda x: x[0])
-        input_scale_vals = np.array([s[1] for s in input_scales], dtype=np.float32)
-        if not np.allclose(input_scale_vals, 1.0, atol=1e-6):
-            input_scale_name = new_name.replace(".weight", ".input_scale")
-            logger.info(f" + {input_scale_name} (per-expert NVFP4 input_scale, shape [{len(input_scales)}])")
-            self.gguf_writer.add_tensor(input_scale_name, input_scale_vals)
+        self._write_scales_tensor(new_name.replace(".weight", ".input_scale"), [s[1] for s in input_scales])
 
         del experts, merged
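
Note (editorial, not part of the patch): the two new helpers share a single
"trivial scale" rule: a scale is skipped when it is a single element equal to
1.0 within 1e-6, so unit scales never produce an extra GGUF tensor. Below is a
minimal standalone sketch of that rule with made-up values; scale_is_trivial
is a free-function stand-in for the ModelBase._scale_is_trivial staticmethod
in the diff above.

    import torch
    import numpy as np

    def scale_is_trivial(scale: torch.Tensor) -> bool:
        # Trivial = at most one element and that element is 1.0 (within tolerance),
        # mirroring ModelBase._scale_is_trivial from the patch.
        return scale.numel() <= 1 and abs(float(scale.float().sum()) - 1.0) < 1e-6

    print(scale_is_trivial(torch.tensor(1.0)))          # True  -> no scale tensor written
    print(scale_is_trivial(torch.tensor(0.5)))          # False -> written as F32
    print(scale_is_trivial(torch.tensor([1.0, 1.0])))   # False -> more than one element

    # Per-expert variant (_write_scales_tensor): the list is only written when
    # some value deviates from 1.0 beyond the same tolerance.
    print(np.allclose([1.0, 1.0, 1.0], 1.0, atol=1e-6))  # True  -> skipped
    print(np.allclose([1.0, 0.5, 1.0], 1.0, atol=1e-6))  # False -> written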