ggml : add NVFP4 quantization type support (#19769)
* WIP: add NVFP4 quantization support
* tests
* improve NVFP4 dot product implementation performance and fix bad super call
* typo
* Use nvfp4 kvalues
* vulkan : fix NVFP4 shader compilation by including kvalues_mxfp4 lookup table
* vulkan and perf fixes
* wip
* Fix metal
* fix vulkan
* Rename threshold & fix wrong scale
* Fix MOE
* Shelf backend implementations (CUDA, Metal, Vulkan, arch-specific SIMD)

  Remove NVFP4 support from GPU backends and architecture-specific optimized dot products. These should be added in separate PRs so backend specialists can review them independently.

  Reverted files:
  - ggml-cuda: common.cuh, convert.cu, mmq.cu/cuh, mmvq.cu, vecdotq.cuh, quantize.cu/cuh, mma.cuh, ggml-cuda.cu, fattn-tile.cuh
  - ggml-metal: ggml-metal.metal, ggml-metal-device.cpp, ggml-metal-impl.h, ggml-metal-ops.cpp
  - ggml-vulkan: ggml-vulkan.cpp, all vulkan-shaders/*
  - ggml-cpu arch: arm/quants.c, x86/quants.c, powerpc/quants.c, s390/quants.c

  Core NVFP4 support (type definition, CPU fallback dot product, quantization, dequantization, conversion) is retained.
* Fix arch-fallback.h: add NVFP4 generic fallback for all platforms

  After shelving the backend-specific SIMD implementations, the generic CPU dot product needs to be aliased on ARM, x86, PowerPC, and s390 platforms that previously relied on arch-specific versions.
* quantize: add NVFP4 as a quantization type option
* Fix ggml_fp32_to_ue4m3: handle subnormal values

  Previously, values with ue4m3_exp <= 0 were clamped to 0, causing all small scales to underflow. This made NVFP4 quantization via llama-quantize produce garbage (PPL = 5.8M), since typical transformer weights have amax/6.0 in the range 0.001-0.01, which falls in the UE4M3 subnormal range. Subnormals are now properly encoded as man * 2^-9 (exp=0, man=1..7), matching the decode path in ggml_ue4m3_to_fp32.

  Result: NVFP4 requantization now produces PPL = 15.25 (vs F16 = 14.33), comparable to Q4_1 (PPL = 15.81) at slightly lower BPW (4.70 vs 5.15).
* Restore ARM NEON NVFP4 dot product implementation

  Restores the optimized ggml_vec_dot_nvfp4_q8_0 for ARM NEON using vqtbl1q_s8 lookup and ggml_vdotq_s32 dot products.

  tg128 performance: 4.37 t/s (generic) -> 13.66 t/s (NEON) = 3.1x speedup
* Optimize ARM NEON NVFP4 dot product: LUT + vpaddq + vfmaq

  - Add ue4m3_scale_lut[128] to ggml-common.h, replacing the branch-heavy ggml_ue4m3_to_fp32() in the hot loop
  - Use vpaddq_s32 for pairwise int32 reduction instead of vaddvq_s32
  - Accumulate with vfmaq_f32 into float32x4_t vector accumulators

  tg128: 8.1 -> 31.0 t/s (3.8x speedup, 77% of Q4_1 speed)
* ARM NEON NVFP4: rearrange q8 to match nibble layout

  Alternative approach: rearrange the q8 data to match the NVFP4 lo/hi nibble layout instead of rearranging the looked-up NVFP4 values. Eliminates the vcombine_s8(vget_low, vget_low) shuffles.

  Performance is equivalent (~18.5 t/s) - the bottleneck is the 2x block overhead from QK=16 vs QK=32, not the shuffle instructions.
* CPU-only backend 64 super-block layout
* cleanup
* Remove unused LUT
* int
* exclude NVFP4 from unsupported ops in metal build
* remove quantization for now
* store scales as native UE4M3, preserve original model bits when possible
* Update convert_hf_to_gguf.py (Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>)
* correct comment
* format
* reduce duplication and cleanup
* Address comments
* move detection to prepare_tensors
* Use math instead of const
* Move
* fix comment
* Shelf quantize tests
* Rebase and move check
* cleanup
* lint
* Update gguf-py/gguf/scripts/gguf_convert_endian.py (Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>)
* Use fallback quant config
* Simplify (Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>)
* organize
* Refactor
* Update convert_hf_to_gguf.py (Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>)
* Update convert_hf_to_gguf.py (Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>)
* Update convert_hf_to_gguf.py (Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>)
* add quantize_nvfp4 (required for test_quants.py)
* add quantize_nvfp4 (required for test_quants.py)
* add quantize_nvfp4 (required for test_quants.py)
* fix return type

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
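As a worked example of the ggml_fp32_to_ue4m3 subnormal fix described above (a stand-alone sketch of the rounding rule from the message, not the C code itself): a per-block scale of amax/6.0 ~ 0.003 lies below the smallest normal UE4M3 value 2**(1-7) = 0.015625, so it has to round to a subnormal.

    # illustrative only; mirrors the sub_man logic in the gguf-py quants hunk below
    scale = 0.003                                    # typical amax/6.0 for transformer weights
    man = min(7, max(0, int(scale * 512.0 + 0.5)))   # round to man * 2**-9, man in 0..7
    assert man == 2                                  # the old code clamped this case to 0
    decoded = man * 2.0 ** -9                        # ~0.0039 instead of underflowing to 0.0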
@@ -3784,6 +3784,7 @@ class GGMLQuantizationType(IntEnum):
     TQ1_0 = 34
     TQ2_0 = 35
     MXFP4 = 39
+    NVFP4 = 40


 class ExpertGatingFuncType(IntEnum):
@@ -3941,6 +3942,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
     GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
     GGMLQuantizationType.TQ2_0: (256, 2 + 64),
     GGMLQuantizationType.MXFP4: (32, 1 + 16),
+    GGMLQuantizationType.NVFP4: (64, 4 + 32),
 }
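For reference, the new (64, 4 + 32) entry means a 64-value NVFP4 super-block occupies 36 bytes: four 1-byte UE4M3 sub-block scales plus 32 bytes of packed 4-bit codes, i.e. 4.5 bits per weight for the raw blocks (the 4.70 BPW quoted in the commit message is a whole-model figure).

    # raw bits-per-weight implied by the GGML_QUANT_SIZES entry above
    block_size, type_size = 64, 4 + 32
    print(type_size * 8 / block_size)  # 4.5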
@@ -139,10 +139,13 @@ class GGUFWriter:
         size = prod(shape)

         if "_exps." in name:
-            expert_count = shape[-2 if ".bias" in name else -3]
-            expert_params += (size // expert_count)
-            expert_sum += expert_count
-            n_expert_tensors += 1
+            if len(shape) >= 3:
+                expert_count = shape[-2 if ".bias" in name else -3]
+                expert_params += (size // expert_count)
+                expert_sum += expert_count
+                n_expert_tensors += 1
+            else:
+                shared_params += size
         else:
             shared_params += size
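A brief illustration of the guard added above, using a hypothetical shape (just to show which axis is read): a stacked experts weight with three or more dimensions takes its expert count from shape[-3], while a lower-rank tensor whose name happens to contain "_exps." now falls through to shared_params instead of indexing a missing axis.

    # hypothetical example, not from the diff: 8 experts stacked along the leading axis
    name, shape = "blk.0.ffn_up_exps.weight", (8, 2048, 768)
    if "_exps." in name and len(shape) >= 3:
        expert_count = shape[-2 if ".bias" in name else -3]
        assert expert_count == 8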
@@ -704,6 +704,65 @@ class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4):
         return (d * qs.astype(np.float32))


+class NVFP4(__Quant, qtype=GGMLQuantizationType.NVFP4):
+    # E2M1 values doubled (kvalues_mxfp4 convention)
+    kvalues = (0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12)
+
+    @staticmethod
+    def ue4m3_to_fp32(x: np.ndarray) -> np.ndarray:
+        """Decode unsigned E4M3 (bias=7) to float, with 0.5 factor for kvalues convention."""
+        exp = (x >> 3).astype(np.int32) & 0xF
+        man = (x & 0x7).astype(np.float32)
+        raw = np.where(
+            exp == 0,
+            man * 2**-9,
+            (1.0 + man / 8.0) * (2.0 ** (exp.astype(np.float32) - 7)))
+        return np.where((x == 0) | (x == 0x7F), 0.0, raw * 0.5)
+
+    @staticmethod
+    def fp32_to_ue4m3(x: np.ndarray) -> np.ndarray:
+        """Vectorized float32 to unsigned E4M3, matching ggml_fp32_to_ue4m3 in C."""
+        x = np.clip(x, 0.0, 448.0).astype(np.float32)
+        bits = x.view(np.uint32)
+        fp32_exp = ((bits >> 23) & 0xFF).astype(np.int32) - 127
+        fp32_man = ((bits >> 20) & 0x7).astype(np.int32)
+        ue4m3_exp = fp32_exp + 7
+
+        # Subnormal
+        sub_man = np.clip((x * 512.0 + 0.5).astype(np.int32), 0, 7)
+        sub_result = np.where(sub_man >= 1, sub_man, 0).astype(np.uint8)
+
+        # Normal with rounding
+        round_bit = ((bits >> 19) & 1).astype(np.int32)
+        man = fp32_man + round_bit
+        exp = ue4m3_exp.copy()
+        overflow = man > 7
+        man = np.where(overflow, 0, man)
+        exp = np.where(overflow, exp + 1, exp)
+        normal_result = np.where(exp >= 15, np.uint8(0x7E), ((exp << 3) | man).astype(np.uint8))
+
+        return np.where(x <= 0.0, np.uint8(0),
+                        np.where(ue4m3_exp <= 0, sub_result,
+                                 np.where(ue4m3_exp >= 15, np.uint8(0x7E), normal_result)))
+
+    @classmethod
+    def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
+        n_super = blocks.shape[0]
+
+        d_bytes, qs = np.hsplit(blocks, [4])
+        d = cls.ue4m3_to_fp32(d_bytes).reshape(n_super, 4, 1)  # (n_super, 4, 1)
+
+        qs = qs.reshape(n_super, 4, 8)
+        lo = (qs & np.uint8(0x0F)).view(np.int8)
+        hi = (qs >> np.uint8(4)).view(np.int8)
+        vals = np.concatenate([lo, hi], axis=-1)  # (n_super, 4, 16)
+
+        kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
+        vals = np.take_along_axis(kvalues, vals, axis=-1)
+
+        return (d * vals.astype(np.float32)).reshape(n_super, 64)
+
+
 class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
     ksigns: bytes = (
         b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
@@ -65,6 +65,7 @@ byteswap_tensors = {
     gguf.GGMLQuantizationType.Q4_K: byteswap_q4_k,
     gguf.GGMLQuantizationType.Q6_K: byteswap_q6_k,
     gguf.GGMLQuantizationType.MXFP4: byteswap_noop,
+    gguf.GGMLQuantizationType.NVFP4: byteswap_noop,
 }
@@ -68,6 +68,7 @@ class GGMLQuants:
             "q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
             "tq1_0", "tq2_0",
             "mxfp4",
+            "nvfp4",
             "iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
             "iq4_nl", "iq4_xs",
         ):