diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 3aec1742e..10817505d 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -830,6 +830,18 @@ static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
 #endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
 }
 
+static __device__ __forceinline__ uint8_t ggml_cuda_fp32_to_ue4m3(float x) {
+#if defined(BLACKWELL_MMA_AVAILABLE) // used only for NVFP4 sub-block scale quantization
+    if (!(x > 0.0f)) {
+        return 0;
+    }
+    const __nv_fp8_e4m3 xf(x);
+    return xf.__x;
+#else
+    NO_DEVICE_CODE; // used only for NVFP4 activation scales, Blackwell only
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
+}
+
 __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
     const uint8_t sign_bit = (x < 0.0f) << 3;
     float ax = fabsf(x) * e;
diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh
index b0f674635..79bb2934c 100644
--- a/ggml/src/ggml-cuda/mma.cuh
+++ b/ggml/src/ggml-cuda/mma.cuh
@@ -1015,25 +1015,35 @@ namespace ggml_cuda_mma {
 #endif // AMD_MFMA_AVAILABLE
     }
 
-    static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
-                                                            const tile<16, 8, int> & A,
-                                                            const tile<8, 8, int> & B,
-                                                            uint32_t a_scale,
-                                                            uint32_t b_scale) {
+    template <ggml_type type>
+    static __device__ __forceinline__ void mma_block_scaled_fp4(tile<16, 8, float> & D,
+                                                                const tile<16, 8, int> & A,
+                                                                const tile<8, 8, int> & B,
+                                                                uint32_t a_scale,
+                                                                uint32_t b_scale) {
 #ifdef BLACKWELL_MMA_AVAILABLE
         const int * Axi = (const int *) A.x;
         const int * Bxi = (const int *) B.x;
        float     * Dxi = (float *) D.x;
-        asm volatile(
-            "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
-            "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
-            "%10, {0, 0}, %11, {0, 0};"
-            : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
-            : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+        if constexpr (type == GGML_TYPE_MXFP4) {
+            asm volatile(
+                "mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
+                "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+                "%10, {0, 0}, %11, {0, 0};"
+                : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+                : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+        } else {
+            asm volatile(
+                "mma.sync.aligned.kind::mxf4nvf4.block_scale.scale_vec::4X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue4m3 "
+                "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
+                "%10, {0, 0}, %11, {0, 0};"
+                : "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
+                : "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
+        }
 #else
         GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
-#endif // BLACKWELL_MMA_AVAILABLE
+#endif  // BLACKWELL_MMA_AVAILABLE
     }
 
     static __device__ __forceinline__ void mma(
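For reference, the two PTX paths above differ only in how the packed 32-bit scale operands are interpreted: kind::mxf4 with scale_vec::2X/ue8m0 applies one power-of-two scale per 32 values of the K=64 slice, while kind::mxf4nvf4 with scale_vec::4X/ue4m3 applies one FP8 (E4M3) scale per 16 values. Below is a minimal host-side model of the per-element arithmetic, assuming already-decoded FP4 values and scales; it is an illustration only, not part of the patch, and all names in it are made up.

#include <cstdio>

// d = sum_k (a[k] * sa[k/block]) * (b[k] * sb[k/block]) over one K=64 slice,
// i.e. what a single output element of the block-scaled MMA accumulates.
static float block_scaled_dot(const float * a, const float * sa,
                              const float * b, const float * sb,
                              int k_len, int block) {
    float d = 0.0f;
    for (int k = 0; k < k_len; ++k) {
        d += (a[k] * sa[k / block]) * (b[k] * sb[k / block]);
    }
    return d;
}

int main() {
    float a[64], b[64];
    for (int k = 0; k < 64; ++k) { a[k] = (float) (k % 4) * 0.5f; b[k] = 1.0f; } // exact e2m1 values
    const float sa2[2] = {1.0f, 2.0f};             // MXFP4 style: scale_vec::2X, one ue8m0 scale per 32 values
    const float sb2[2] = {1.0f, 1.0f};
    const float sa4[4] = {1.0f, 1.0f, 2.0f, 2.0f}; // NVFP4 style: scale_vec::4X, one ue4m3 scale per 16 values
    const float sb4[4] = {1.0f, 1.0f, 1.0f, 1.0f};
    printf("mxf4-style dot: %g\n", block_scaled_dot(a, sa2, b, sb2, 64, 32));
    printf("nvf4-style dot: %g\n", block_scaled_dot(a, sa4, b, sb4, 64, 16));
    return 0;
}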
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 3f01ff5bf..e1add5e03 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -122,7 +122,7 @@ void ggml_cuda_mul_mat_q(
         || GGML_CUDA_CC_IS_CDNA(cc);
 
     // TODO: tighter pool buffer size vs q8 path
-    const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
+    const bool use_native_fp4 = blackwell_mma_available(cc) && (src0->type == GGML_TYPE_MXFP4 || src0->type == GGML_TYPE_NVFP4);
 
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -133,9 +133,9 @@ void ggml_cuda_mul_mat_q(
         const int64_t s11 = src1->nb[1] / ts_src1;
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[3] / ts_src1;
-        if (use_native_mxfp4) {
+        if (use_native_fp4) {
             static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
-            quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+            quantize_mmq_fp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
                 ne11, ne12, ne13, stream);
         } else {
@@ -146,10 +146,8 @@ void ggml_cuda_mul_mat_q(
         }
 
         // Stride depends on quantization format
-        const int64_t s12 = use_native_mxfp4 ?
-            ne11 * ne10_padded * sizeof(block_fp4_mmq) /
-                (8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
-            :
+        const int64_t s12 = use_native_fp4 ?
+            ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) : // block_fp4_mmq holds 256 values
             ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
         const int64_t s13 = ne12*s12;
 
@@ -198,8 +196,8 @@ void ggml_cuda_mul_mat_q(
         const int64_t s12 = src1->nb[2] / ts_src1;
         const int64_t s13 = src1->nb[3] / ts_src1;
 
-        if (use_native_mxfp4) {
-            quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+        if (use_native_fp4) {
+            quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
                 ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
         } else {
             quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@@ -208,8 +206,9 @@ void ggml_cuda_mul_mat_q(
         CUDA_CHECK(cudaGetLastError());
     }
 
-    const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
-                                           ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
+    static_assert(QK_K == 8 * QK_MXFP4, "QK_K needs to be 8 * QK_MXFP4");
+    const int64_t s12 = use_native_fp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) :
+                                         ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
    const int64_t s13 = ne12*s12;
 
    // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
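The FP4 stride above can be cross-checked with a few lines of host arithmetic: a block_fp4_mmq covers QK_K = 256 values in 144 bytes (4 * sizeof(block_q8_1), as the static_assert requires), and s12 is expressed in units of int, exactly like the q8_1 branch. The shapes in this sketch are made up; only the block sizes come from the patch.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t QK_K_vals = 256, block_fp4_mmq_bytes = 144, int_bytes = 4;
    const int64_t ne10_padded = 4096, ne11 = 32;  // example shapes (assumed)
    // Same expression as the use_native_fp4 branch of s12 in mmq.cu.
    const int64_t s12 = ne11 * ne10_padded * block_fp4_mmq_bytes / (QK_K_vals * int_bytes);
    // Equivalent view: blocks per padded row times ints per block.
    const int64_t blocks_per_row = ne10_padded / QK_K_vals;
    assert(s12 == ne11 * blocks_per_row * (block_fp4_mmq_bytes / int_bytes));
    printf("s12 = %lld ints\n", (long long) s12);
    return 0;
}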
-#define MMQ_ITER_K            256
-#define MMQ_ITER_K_MXFP4_FP4  512
-#define MMQ_NWARPS            8
+#define MMQ_ITER_K     256
+#define MMQ_ITER_K_FP4 512
+#define MMQ_NWARPS     8
 
 typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
 typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00);
@@ -46,9 +46,12 @@ struct block_q8_1_mmq {
     int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
 };
 
+// This struct is used for FP4 data types (currently only on Blackwell).
+// MXFP4 has a block size of 32; each int32 of d4 holds 2 e8m0 scales in its lower 16 bits.
+// NVFP4 has a block size of 16; each int32 of d4 holds 4 ue4m3 scales.
 struct block_fp4_mmq {
-    uint32_t d4[4];      // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc.
-    int8_t   qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values
+    uint32_t d4[4];
+    int8_t   qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte)
 };
 static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
@@ -143,10 +146,11 @@ static int get_mmq_y_host(const int cc) {
 
 static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
 #if defined(BLACKWELL_MMA_AVAILABLE)
-    return type == GGML_TYPE_MXFP4 ? MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K;
-#else
-    return MMQ_ITER_K;
+    if (type == GGML_TYPE_NVFP4 || type == GGML_TYPE_MXFP4) {
+        return MMQ_ITER_K_FP4;
+    }
 #endif // defined(BLACKWELL_MMA_AVAILABLE)
+    return MMQ_ITER_K;
 }
 
 static constexpr __device__ int get_mmq_y_device() {
@@ -213,8 +217,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
 }
 
 #define MMQ_MMA_TILE_X_K_Q8_0  (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
-#define MMQ_MMA_TILE_X_K_FP4   (2*MMQ_TILE_NE_K + 8 + 4)                // MXFP4
-#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)  // NVFP4
+#define MMQ_MMA_TILE_X_K_FP4   (2*MMQ_TILE_NE_K + 8 + 4)                // MXFP4 and NVFP4 on Blackwell
+#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)  // NVFP4, generic (non-Blackwell) path
 #define MMQ_MMA_TILE_X_K_Q8_1  (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
 #define MMQ_MMA_TILE_X_K_Q2_K  (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
 #define MMQ_MMA_TILE_X_K_Q3_K  (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
@@ -240,7 +244,11 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
         case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
         // tile sizes are the same for Q8_1 and FP4 for blackwell
         case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1;
+#if defined(BLACKWELL_MMA_AVAILABLE)
+        case GGML_TYPE_NVFP4: return MMQ_MMA_TILE_X_K_FP4;
+#else
         case GGML_TYPE_NVFP4: return MMQ_MMA_TILE_X_K_NVFP4;
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
         case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
         case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
         case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
@@ -934,6 +942,128 @@ static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restr
     }
 }
 
+#ifdef BLACKWELL_MMA_AVAILABLE
+template <int mmq_y, bool need_check>
+static __device__ __forceinline__ void load_tiles_nvfp4_nvfp4(const char * __restrict__ x,
+                                                              int * __restrict__ x_tile,
+                                                              const int kbx0,
+                                                              const int i_max,
+                                                              const int stride) {
+    constexpr int nwarps          = mmq_get_nwarps_device();
+    constexpr int warp_size       = ggml_cuda_get_physical_warp_size();
+    constexpr int iter_k          = get_iter_k(GGML_TYPE_NVFP4);
+    constexpr int threads_per_row = iter_k / QK_NVFP4; // each thread processes 1 block
+    constexpr int rows_per_warp   = warp_size / threads_per_row;
+
+    uint32_t * x_u32 = (uint32_t *) x_tile;
+
+    const int txi         = threadIdx.x;
+    const int kbx         = txi % threads_per_row;
+    const int row_in_warp = txi / threads_per_row;
+
+    const block_nvfp4 * bxi_base    = (const block_nvfp4 *) x + kbx0 + kbx;
+    uint32_t *          x_u32_scale = x_u32 + 64 + kbx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
+        int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
+
+        if constexpr (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_nvfp4 * bxi      = bxi_base + i * stride;
+        const int           row_base = i * MMQ_MMA_TILE_X_K_FP4;
+        const int           q_base   = row_base + 8 * kbx;
+
+        const uint32_t * src_qs = reinterpret_cast<const uint32_t *>(bxi->qs);
+
+#pragma unroll
+        for (int sub = 0; sub < QK_NVFP4 / QK_NVFP4_SUB; ++sub) {
+            x_u32[q_base + 2 * sub + 0] = src_qs[2 * sub + 0];
+            x_u32[q_base + 2 * sub + 1] = src_qs[2 * sub + 1];
+        }
+
+        x_u32_scale[row_base] = get_int_b4(bxi->d, 0);
+    }
+}
+
+// Shared MMA kernel for MXFP4 and NVFP4 on Blackwell.
+// Both quantizations encode values as e2m1 (FP4) and produce one uint32 scale per
+// m16n8k64 MMA call; only the PTX kind (scale_vec::2X ue8m0 vs scale_vec::4X ue4m3)
+// and the per-type stride constant differ.
+template <ggml_type type, int mmq_x, int mmq_y>
+static __device__ __forceinline__ void vec_dot_fp4_fp4_mma(const int * __restrict__ x,
+                                                           const int * __restrict__ y,
+                                                           float * __restrict__ sum,
+                                                           const int k00) {
+    static_assert(type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4,
+                  "vec_dot_fp4_fp4_mma: type must be MXFP4 or NVFP4");
+
+    typedef tile<16, 8, int>   tile_A;
+    typedef tile<8, 8, int>    tile_B;
+    typedef tile<16, 8, float> tile_C;
+
+    constexpr int stride        = MMQ_MMA_TILE_X_K_FP4;
+    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = 2 * granularity;
+    constexpr int ntx           = rows_per_warp / tile_C::I;
+    constexpr int nfrags        = MMQ_TILE_NE_K / tile_A::J;
+
+    y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_K);
+
+    const int *      x_qs = (const int *) x;
+    const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
+    const int *      y_qs = (const int *) y + 4;
+    const uint32_t * y_sc = (const uint32_t *) y;
+
+    // 2 threads per quad supply the packed scale register to the block_scale MMA,
+    // see https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
+    const int tidx_A = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
+    const int tidx_B = threadIdx.x / 4;
+    const int i0     = (threadIdx.y / ntx) * rows_per_warp;
+
+    tile_A   A[ntx][nfrags];
+    uint32_t scaleA[ntx][nfrags];
+
+#pragma unroll
+    for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+        for (int frag = 0; frag < nfrags; ++frag) {
+            const int k0 = k00 + frag * tile_A::J;
+            load_ldmatrix(A[n][frag], x_qs + (i0 + n * tile_A::I) * stride + k0, stride);
+            scaleA[n][frag] = x_sc[(i0 + n * tile_A::I + tidx_A) * stride + k0 / tile_A::J];
+        }
+    }
+
+#pragma unroll
+    for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
+        tile_B   B[nfrags];
+        uint32_t scaleB[nfrags];
+
+#pragma unroll
+        for (int frag = 0; frag < nfrags; ++frag) {
+            const int k0 = frag * tile_B::J;
+            load_generic(B[frag], y_qs + j0 * MMQ_TILE_Y_K + k0, MMQ_TILE_Y_K);
+            scaleB[frag] = y_sc[(j0 + tidx_B) * MMQ_TILE_Y_K + frag];
+        }
+
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+            for (int frag = 0; frag < nfrags; ++frag) {
+                tile_C C = {};
+                mma_block_scaled_fp4<type>(C, A[n][frag], B[frag], scaleA[n][frag], scaleB[frag]);
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
+                }
+            }
+        }
+    }
+}
+#endif // BLACKWELL_MMA_AVAILABLE
+
 template <int mmq_y, bool need_check>
 static __device__ __forceinline__ void load_tiles_nvfp4(const char * __restrict__ x,
@@ -1163,77 +1293,6 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
 }
 
-template <int mmq_x, int mmq_y>
-static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
-                                                               const int * __restrict__ y,
-                                                               float * __restrict__ sum,
-                                                               const int k00) {
-    typedef tile<16, 8, int>   tile_A;
-    typedef tile<8, 8, int>    tile_B;
-    typedef tile<16, 8, float> tile_C; // Output is float for native scaled MMA
-
-    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
-    constexpr int rows_per_warp = 2 * granularity;
-    constexpr int ntx           = rows_per_warp / tile_C::I; // Number of x minitiles per warp.
-
-    y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
-
-    // Match layout from load_tiles_mxfp4_fp4
-    const int *      x_qs = (const int *) x;
-    const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
-    const int *      y_qs = (const int *) y + 4;
-    const uint32_t * y_sc = (const uint32_t *) y;
-
-    // tile_A has a length of 64 logical values vs. 32 values in block_mxfp4
-    tile_A   A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
-    uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
-
-    // Block scale
-    // Each thread has to point to a 4 byte scale value
-    // https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
-
-    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
-
-#pragma unroll
-    for (int n = 0; n < ntx; ++n) {
-#pragma unroll
-        for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
-            const int k0 = k00 + k01;
-
-            load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
-                          MMQ_MMA_TILE_X_K_FP4);
-
-            // based on block-scaling document, 2 threads in each quad need to supply to the scale value
-            const int tidx = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
-            scaleA[n][k01 / (2 * QI_MXFP4)] =
-                *(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4));
-        }
-    }
-
-#pragma unroll
-    for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
-#pragma unroll
-        for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
-            tile_B   B;
-            uint32_t scaleB; // 2xN scales
-
-            load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
-
-            scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)];
-
-#pragma unroll
-            for (int n = 0; n < ntx; ++n) {
-                tile_C C;
-
-                mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB);
-#pragma unroll
-                for (int l = 0; l < tile_C::ne; ++l) {
-                    sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
-                }
-            }
-        }
-    }
-}
 
 template <int mmq_x, int mmq_y>
 static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
@@ -3259,7 +3318,7 @@ struct mmq_type_traits {
     static constexpr int vdr = VDR_MXFP4_Q8_1_MMQ;
 #ifdef BLACKWELL_MMA_AVAILABLE
     static constexpr load_tiles_mmq_t load_tiles  = load_tiles_mxfp4_fp4;
-    static constexpr vec_dot_mmq_t    vec_dot_mma = vec_dot_mxfp4_mxfp4_mma;
+    static constexpr vec_dot_mmq_t    vec_dot_mma = vec_dot_fp4_fp4_mma<GGML_TYPE_MXFP4, mmq_x, mmq_y>;
 #else
     static constexpr load_tiles_mmq_t load_tiles  = load_tiles_mxfp4;
     static constexpr vec_dot_mmq_t    vec_dot_mma = vec_dot_q8_0_q8_1_mma;
@@ -3270,8 +3329,13 @@ struct mmq_type_traits {
     static constexpr int vdr = VDR_NVFP4_Q8_1_MMQ;
+#ifdef BLACKWELL_MMA_AVAILABLE
+    static constexpr load_tiles_mmq_t load_tiles   = load_tiles_nvfp4_nvfp4<mmq_y, need_check>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_fp4_fp4_mma<GGML_TYPE_NVFP4, mmq_x, mmq_y>;
+#else
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_nvfp4;
     static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_16_q8_1_mma;
+#endif // BLACKWELL_MMA_AVAILABLE
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a;
 };
@@ -3406,7 +3470,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 
 #if defined(BLACKWELL_MMA_AVAILABLE)
     // FP4 tile stores 8 blocks
-    constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1;
+    constexpr int ne_block = (type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4) ? QK_K : 4 * QK8_1;
 #else
     constexpr int ne_block = 4 * QK8_1;
 #endif // defined(BLACKWELL_MMA_AVAILABLE)
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index 8f55cace1..da48f313a 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -115,6 +115,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_pascal_older(gg
         case GGML_TYPE_IQ4_NL: return 6;
         case GGML_TYPE_IQ4_XS: return 5;
         case GGML_TYPE_MXFP4:  return 4;
+        case GGML_TYPE_NVFP4:  return 4;
         case GGML_TYPE_Q2_K:   return 4;
         case GGML_TYPE_Q3_K:   return 4;
         case GGML_TYPE_Q4_0:   return 6;
@@ -135,6 +136,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_turing_plus(ggm
         case GGML_TYPE_IQ3_S:   return 6;
         case GGML_TYPE_IQ3_XXS: return 7;
         case GGML_TYPE_MXFP4:   return 7;
+        case GGML_TYPE_NVFP4:   return 8;
         case GGML_TYPE_Q2_K:    return 7;
         case GGML_TYPE_Q3_K:    return 5;
         default:                return MMVQ_MAX_BATCH_SIZE;
@@ -221,6 +223,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type
         case GGML_TYPE_IQ4_NL: return 7;
         case GGML_TYPE_IQ4_XS: return 5;
         case GGML_TYPE_MXFP4:  return 5;
+        case GGML_TYPE_NVFP4:  return 5;
         case GGML_TYPE_Q3_K:   return 4;
         case GGML_TYPE_Q4_0:   return 7;
         case GGML_TYPE_Q4_1:   return 7;
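The quantize_mmq_nvfp4 kernel in the next file picks one ue4m3 (FP8 E4M3) scale per 16-value sub-block: the starting guess is amax/6 (6 is the largest e2m1 magnitude), and the nearby FP8 codes are tried, keeping whichever minimizes the squared reconstruction error. Below is a host-side sketch of that search; the E4M3 and E2M1 helpers are simplified stand-ins for the CUDA conversions used in the kernel, so rounding may differ slightly, and none of the names below come from the patch.

#include <cmath>
#include <cstdint>
#include <cstdio>

static float ue4m3_to_fp32(uint8_t c) {              // unsigned E4M3, NaN code 0x7f ignored
    const int e = (c >> 3) & 0xf, m = c & 0x7;
    return e == 0 ? ldexpf((float) m, -9)            // subnormal: (m/8) * 2^-6
                  : ldexpf(1.0f + m / 8.0f, e - 7);  // normal: (1 + m/8) * 2^(e-7)
}
static uint8_t fp32_to_ue4m3(float x) {              // nearest valid code, brute force
    uint8_t best = 0; float err = INFINITY;
    for (int c = 0; c <= 0x7e; ++c) {
        const float d = fabsf(ue4m3_to_fp32((uint8_t) c) - x);
        if (d < err) { err = d; best = (uint8_t) c; }
    }
    return best;
}
static float quant_e2m1(float v, float scale) {      // round |v|/scale to the e2m1 grid, keep sign
    static const float grid[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};
    const float a = scale > 0.0f ? fabsf(v) / scale : 0.0f;
    float best = 0.0f, err = INFINITY;
    for (float g : grid) { if (fabsf(g - a) < err) { err = fabsf(g - a); best = g; } }
    return copysignf(best * scale, v);
}

int main() {
    float v[16]; for (int k = 0; k < 16; ++k) v[k] = 0.1f * (k - 7);  // one 16-value sub-block
    float amax = 0.0f; for (float x : v) amax = fmaxf(amax, fabsf(x));
    const int offs[5] = {0, -1, 1, -2, 2};           // same neighbourhood the kernel searches
    const int c0 = fp32_to_ue4m3(amax / 6.0f);
    float best_err = INFINITY; int best_c = c0;
    for (int i = 0; i < 5; ++i) {
        const int c = c0 + offs[i];
        if (c < 0 || c > 0x7e) { continue; }
        const float s = ue4m3_to_fp32((uint8_t) c);
        float e = 0.0f;
        for (float x : v) { const float d = x - quant_e2m1(x, s); e += d * d; }
        if (e < best_err) { best_err = e; best_c = c; }
    }
    printf("scale code %d, squared error %g\n", best_c, best_err);
    return 0;
}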
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 4300ffc14..52f664719 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -70,6 +70,102 @@ __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
     return static_cast<uint8_t>(biased);
 }
 
+
+static __global__ void quantize_mmq_nvfp4(
+        const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
+        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
+        const int64_t ne0, const int64_t ne1, const int64_t ne2) {
+#if defined(BLACKWELL_MMA_AVAILABLE)
+
+    const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
+    if (i0_base >= ne0) {
+        return;
+    }
+
+    const int64_t i1  = blockIdx.x;
+    const int64_t i2  = blockIdx.z % ne2;
+    const int64_t i3  = blockIdx.z / ne2;
+    const int64_t i01 = ids ? ids[i1] : i1;
+
+    const int64_t k_block        = i0_base / QK_K;
+    const int64_t blocks_per_col = (ne0 + QK_K - 1) / QK_K;
+    if (k_block >= blocks_per_col) {
+        return;
+    }
+
+    const int64_t ib = blockIdx.z * ((int64_t) blocks_per_col * ne1) + k_block * ne1 + blockIdx.x;
+    block_fp4_mmq * y  = (block_fp4_mmq *) vy;
+    block_fp4_mmq * yb = y + ib;
+
+    const int sub = (i0_base % QK_K) / QK_NVFP4_SUB;
+
+    float vals_raw[QK_NVFP4_SUB];
+    float amax_raw = 0.0f;
+    const int64_t base_idx = i3 * s03 + i2 * s02 + i01 * s01;
+#pragma unroll
+    for (int k = 0; k < QK_NVFP4_SUB; k++) {
+        const int64_t i00 = i0_base + k;
+        if (i00 < ne00) {
+            const float v = x[base_idx + i00];
+            vals_raw[k] = v;
+            amax_raw    = fmaxf(amax_raw, fabsf(v));
+        } else {
+            vals_raw[k] = 0.0f;
+        }
+    }
+
+    static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2};
+    const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
+
+    float   best_err       = FLT_MAX;
+    uint8_t fp8_code       = 0;
+    float   subblock_scale = 0.0f;
+
+#pragma unroll // check +/- 2 codes to find the one that minimizes NVFP4 activation loss, negligible overhead on Blackwell
+    for (int i = 0; i < 5; i++) {
+        const int test_code = first_fp8_code + test_offsets[i];
+        if (test_code < 0 || test_code > 0x7e) {
+            continue;
+        }
+        const uint8_t code           = (uint8_t) test_code;
+        const float   test_scale     = ggml_cuda_ue4m3_to_fp32(code);
+        const float   test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
+        float cur_err = 0.0f;
+#pragma unroll
+        for (int k = 0; k < QK_NVFP4_SUB; ++k) {
+            const float   v        = vals_raw[k];
+            const uint8_t q        = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
+            const float   err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
+            cur_err = fmaf(err_diff, err_diff, cur_err);
+        }
+
+        if (cur_err < best_err) {
+            best_err       = cur_err;
+            fp8_code       = code;
+            subblock_scale = test_scale;
+        }
+    }
+
+    const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
+    uint32_t q0 = 0;
+    uint32_t q1 = 0;
+#pragma unroll // this is faster than the previous __nv_fp4x4_e2m1 version
+    for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
+        q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  0], inv_scale) << (8 * k);
+        q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  8], inv_scale) << (8 * k + 4);
+        q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k +  4], inv_scale) << (8 * k);
+        q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
+    }
+
+    uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
+    yqs[2 * sub + 0] = q0;
+    yqs[2 * sub + 1] = q1;
+    reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
+#else
+    NO_DEVICE_CODE; // used only for Blackwell NVFP4 activations
+#endif // defined(BLACKWELL_MMA_AVAILABLE)
+
+}
+
 // quantize values in the format mxfp4 is stored which is interleaved nibbles
 // i.e. a block a0-a31 is represented as a0a16,a1a17 ... a15a31
 static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
@@ -316,28 +412,32 @@ void quantize_mmq_q8_1_cuda(
     }
 }
 
-void quantize_mmq_mxfp4_cuda(const float * x,
-                             const int32_t * ids,
-                             void * vy,
-                             [[maybe_unused]] const ggml_type type_src0,
-                             const int64_t ne00,
-                             const int64_t s01,
-                             const int64_t s02,
-                             const int64_t s03,
-                             const int64_t ne0,
-                             const int64_t ne1,
-                             const int64_t ne2,
-                             const int64_t ne3,
-                             cudaStream_t stream) {
-    GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
+void quantize_mmq_fp4_cuda(
+        const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
+        const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
+        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
+    GGML_ASSERT(type_src0 == GGML_TYPE_MXFP4 || type_src0 == GGML_TYPE_NVFP4);
+    GGML_ASSERT(ne0 > 0);
 
-    constexpr int nwarps         = 8;
-    constexpr int vals_per_warp  = 2 * QK_MXFP4;
-    constexpr int vals_per_block = nwarps * vals_per_warp;
+    if (type_src0 == GGML_TYPE_NVFP4) {
+        GGML_ASSERT(ne00 % QK_NVFP4 == 0);
+        constexpr int nvfp4_block_size = 128;
+        const int64_t block_num_y = (ne0 + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size);
+        const dim3 block_size(nvfp4_block_size, 1, 1);
+        const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
+        quantize_mmq_nvfp4<<<num_blocks, block_size, 0, stream>>>(
+            x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+    } else {
+        GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
 
-    const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
-    const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
-    const dim3 block_size(WARP_SIZE, nwarps, 1);
+        constexpr int nwarps         = 8;
+        constexpr int vals_per_warp  = 2 * QK_MXFP4;
+        constexpr int vals_per_block = nwarps * vals_per_warp;
 
-    quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+        const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
+        const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
+        const dim3 block_size(WARP_SIZE, nwarps, 1);
+
+        quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
+    }
 }
diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh
index 6a91df635..768a3ae6d 100644
--- a/ggml/src/ggml-cuda/quantize.cuh
+++ b/ggml/src/ggml-cuda/quantize.cuh
@@ -26,7 +26,7 @@ void quantize_mmq_q8_1_cuda(
     ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
     int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
 
-void quantize_mmq_mxfp4_cuda(const float * x,
+void quantize_mmq_fp4_cuda(const float * x,
     const int32_t * ids,
     void * vy,
     ggml_type type_src0,
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index b4fd2c4dc..941c20ce1 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3815,7 +3815,7 @@ struct test_mul_mat : public test_case {
 
     double max_nmse_err(ggml_backend_t backend) override {
         // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
-        if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
+        if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
             return 2e-2;
         }
         return max_nmse_err();
@@ -3951,7 +3951,7 @@ struct test_mul_mat_id : public test_case {
 
     double max_nmse_err(ggml_backend_t backend) override {
         // for blackwell we quantize activations to mxfp4 instead of q8_1 so we add higher tolerance
-        if (type_a == GGML_TYPE_MXFP4 && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
+        if ((type_a == GGML_TYPE_MXFP4 || type_a == GGML_TYPE_NVFP4) && backend_has_feature(backend, "BLACKWELL_NATIVE_FP4")) {
             return 2e-2;
         }
         return max_nmse_err();
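Finally, both FP4 activation paths rely on the same block_fp4_mmq layout: 256 FP4 values packed two per byte plus four 32-bit scale words, holding 8 e8m0 scales for MXFP4 (one per 32 values) or 16 ue4m3 scales for NVFP4 (one per 16 values). A compile-time size check against a mirror of that struct; the mirror below is an illustration, not the header's definition.

#include <cstdint>

struct block_fp4_mmq_ref {   // mirrors the member layout of block_fp4_mmq in mmq.cuh
    uint32_t d4[4];
    int8_t   qs[4 * 32];
};

static_assert(sizeof(block_fp4_mmq_ref) == 144,             "144 bytes per 256 values, i.e. 4 * sizeof(block_q8_1)");
static_assert(sizeof(block_fp4_mmq_ref::qs) * 2 == 256,     "2 FP4 values per byte");
static_assert(sizeof(block_fp4_mmq_ref::d4) == 16,          "16 ue4m3 scales (NVFP4) or 8 packed e8m0 scales (MXFP4)");

int main() { return 0; }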