ggml-cuda: Add generic NVFP4 MMQ kernel (#21074)
* Introduced NVFP4 generic MMQ kernel
* Added extra FP8 guard, hope to solve CI HIP failure
* Renamed tiles and used HIP_FP8_AVAILABLE
* Removed remaining FP8 straggler and added const int
* Const
* Removed DECL_MMQ_CASE artifact
* Removed newline
* Removed space after else
* Changed HIP FP8 NVFP4 conversion gate
* Added new line to bottom of mmq.cu (270)
* Removed extra spaces
* Removed single space in front of else on line 814
* Added NVFP4 to the generate-cu script so HIP can see it, further tightened logic
* Included generated mmq-instance-nvfp4.cu
* Added NVFP4 MMQ to HIP check ignore list
* Update ggml/src/ggml-cuda/mmq.cuh: changed Q3_K tile to read MMQ_MMA_TILE_X_K_NVFP4
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Update ggml/src/ggml-cuda/mmq.cuh: changed Q3_K tile to read MMQ_MMA_TILE_X_K_NVFP4 in tile assert
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Update ggml/src/ggml-cuda/mmq.cuh: added function name to closing #endif
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Added function names to closing #endif
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
@@ -800,19 +800,32 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
 }
 
 static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
-#ifdef FP8_AVAILABLE
-    const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
-#if defined(GGML_USE_HIP) && defined(CDNA3)
-    // ROCm dose not support fp8 in software on devices with fp8 hardware,
+#if defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
+    // ROCm does not support fp8 in software on devices with fp8 hardware,
     // but CDNA3 supports only e4m3_fnuz (no inf).
+    const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
     const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
-#else
-    const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
-#endif // defined(GGML_USE_HIP) && defined(GGML_USE_HIP)
     return static_cast<float>(xf) / 2;
 #else
-    NO_DEVICE_CODE;
-#endif // FP8_AVAILABLE
+#if defined(FP8_AVAILABLE) && !defined(GGML_USE_HIP)
+    const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
+    const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
+    return static_cast<float>(xf) / 2;
+#else
+    if (x == 0 || x == 0x7F || x == 0xFF) { // Convert NaN to 0.0f
+        return 0.0f;
+    }
+    const int exp = (x >> 3) & 0xF;
+    const int man = x & 0x7;
+    float raw;
+    if (exp == 0) {
+        raw = ldexpf((float) man, -9);
+    } else {
+        raw = ldexpf(1.0f + (float) man / 8.0f, exp - 7);
+    }
+    return static_cast<float>(raw / 2);
+#endif // defined(FP8_AVAILABLE) && !defined(GGML_USE_HIP)
+#endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
 }
 
 __device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
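For reference, the software-fallback branch added above decodes an unsigned e4m3 byte by hand: 4 exponent bits with bias 7, 3 mantissa bits, subnormals at exponent 0, and a final halving of the result. Below is a minimal host-side sketch of that same math; the helper name ue4m3_to_fp32_ref and the main() harness are illustrative stand-ins, not part of the patch (the device function in the diff is ggml_cuda_ue4m3_to_fp32).

#include <cmath>
#include <cstdint>
#include <cstdio>

// Host-side mirror of the software fallback, usable on machines without
// FP8 hardware. Hypothetical helper, not taken from the patch itself.
static float ue4m3_to_fp32_ref(uint8_t x) {
    if (x == 0 || x == 0x7F || x == 0xFF) { // zero input, or a NaN encoding mapped to 0.0f
        return 0.0f;
    }
    const int exp = (x >> 3) & 0xF; // 4 exponent bits, bias 7
    const int man = x & 0x7;        // 3 mantissa bits
    float raw;
    if (exp == 0) {
        raw = ldexpf((float) man, -9);                    // subnormal: man * 2^-9
    } else {
        raw = ldexpf(1.0f + (float) man / 8.0f, exp - 7); // normal: (1 + man/8) * 2^(exp-7)
    }
    return raw / 2; // the kernel halves the decoded value
}

int main() {
    printf("0x38 -> %f\n", ue4m3_to_fp32_ref(0x38)); // exp=7, man=0: raw=1.0, prints 0.500000
    printf("0x01 -> %f\n", ue4m3_to_fp32_ref(0x01)); // subnormal: raw=2^-9, prints 0.000977
    printf("0x7F -> %f\n", ue4m3_to_fp32_ref(0x7F)); // NaN encoding, prints 0.000000
    return 0;
}

On the hardware paths the same NaN handling is done branch-free: multiplying x by (x != 0x7F && x != 0xFF) zeroes both NaN encodings before the byte is reinterpreted as __nv_fp8_e4m3 (or __hip_fp8_e4m3_fnuz on CDNA3, which has no inf), and the converted value is likewise divided by 2.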