fix(metal): correct Q4_0 contiguous kernel nibble extraction
CI (apple) / macOS-latest-ios (pull_request) Waiting to run
CI (apple) / macos-latest-ios-xcode (pull_request) Waiting to run
CI (apple) / macOS-latest-tvos (pull_request) Waiting to run
CI (apple) / macOS-latest-visionos (pull_request) Waiting to run
CI (apple) / macOS-latest-swift (generic/platform=iOS) (pull_request) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=macOS) (pull_request) Blocked by required conditions
CI (apple) / macOS-latest-swift (generic/platform=tvOS) (pull_request) Blocked by required conditions
CI (self-hosted) / ggml-ci-nvidia-cuda (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-nvidia-vulkan-cm2 (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-mac-metal (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-mac-webgpu (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-mac-vulkan (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-linux-intel-vulkan (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-win-intel-vulkan (pull_request) Waiting to run
CI (self-hosted) / ggml-ci-intel-openvino-gpu-low-perf (pull_request) Waiting to run
CI / build-cmake-pkg (pull_request) Waiting to run
CI / macOS-latest-arm64 (pull_request) Waiting to run
CI / macOS-latest-x64 (pull_request) Waiting to run
CI / macOS-latest-arm64-webgpu (pull_request) Waiting to run
CI / ubuntu-cpu (arm64, ubuntu-24.04-arm) (pull_request) Waiting to run
CI / ubuntu-cpu (ppc64le, ubuntu-24.04-ppc64le) (pull_request) Waiting to run
CI / ubuntu-cpu (s390x, ubuntu-24.04-s390x) (pull_request) Waiting to run
CI / ubuntu-cpu (x64, ubuntu-22.04) (pull_request) Waiting to run
CI / android-arm64 (pull_request) Waiting to run
CI / ubuntu-latest-rpc (pull_request) Waiting to run
CI / ubuntu-24-vulkan (arm64, ubuntu-24.04-arm) (pull_request) Waiting to run
CI / ubuntu-24-vulkan (x64, ubuntu-24.04) (pull_request) Waiting to run
CI / ubuntu-24-webgpu (pull_request) Waiting to run
CI / ubuntu-24-webgpu-wasm (pull_request) Waiting to run
CI / ubuntu-22-hip (pull_request) Waiting to run
CI / ubuntu-22-musa (pull_request) Waiting to run
CI / windows-latest (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (pull_request) Waiting to run
CI / windows-latest (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (pull_request) Waiting to run
CI / windows-latest (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (pull_request) Waiting to run
CI / windows-latest (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DG… (pull_request) Waiting to run
CI / windows-latest (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (pull_request) Waiting to run
CI / ubuntu-latest-cuda (pull_request) Waiting to run
CI / windows-2022-cuda (12.4) (pull_request) Waiting to run
CI / windows-latest-hip (pull_request) Waiting to run
CI / ubuntu-cpu-riscv64-native (pull_request) Waiting to run
CI / ggml-ci-x64-cpu-low-perf (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-low-perf (pull_request) Waiting to run
CI / ggml-ci-x64-cpu-high-perf (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-high-perf-sve (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai (pull_request) Waiting to run
CI / ggml-ci-arm64-cpu-kleidiai-graviton4 (pull_request) Waiting to run
EditorConfig Checker / editorconfig (pull_request) Waiting to run
Server / server (default) (pull_request) Waiting to run
Server / server (backend-sampling) (pull_request) Waiting to run
Server / server-windows (pull_request) Waiting to run
Pull Request Labeler / labeler (pull_request_target) Waiting to run

- Extract all 8 nibbles per uint32_t with proper bit shifts
- Use il-based offset for uint32_t selection (qs[il/8] and qs[il/8+2])
- Apply the -8 bias correction once per block instead of accumulating it four times
This commit is contained in:
Kaloyan Nikolov
2026-05-01 00:13:56 +02:00
parent 06f05e71c1
commit 31ce8b1ae5
+19 -32
View File
@@ -3575,49 +3575,36 @@ kernel void kernel_mul_mv_q4_0_f32_c(
const short il = (tiisg % (NW / 16)) * 8;
const int ib0 = ix;
const uint q_off = il / 8;
device const float * yb = y + ib0 * QK4_0 + il;
for (int ib = ib0; ib < nb; ib += 16) {
float sumy[2] = {0.f, 0.f};
float sumy = 0.f;
FOR_UNROLL (short i = 0; i < 8; i += 2) {
sumy[0] += yb[i + 0] + yb[i + 1];
float yl0 = yb[i + 0];
float yl1 = yb[i + 1] / 256.f;
sumy += yb[i + 0] + yb[i + 1] + yb[i + 16] + yb[i + 17];
}
sumy[1] += yb[i + 16] + yb[i + 17];
float yl8 = yb[i + 16] / 16.f;
float yl9 = yb[i + 17] / 4096.f;
FOR_UNROLL (short row = 0; row < NR0; row++) {
const float d = ax[row][ib].d;
device const uint32_t * qs = (device const uint32_t *) (ax[row][ib].qs);
FOR_UNROLL (short row = 0; row < NR0; row++) {
const float d = ax[row][ib].d;
const uint32_t q0 = qs[q_off];
const uint32_t q1 = qs[q_off + 2];
device const uint32_t * qs = (device const uint32_t *) (ax[row][ib].qs);
float acc = 0.f;
float a0 = 0.f, a1 = 0.f, a2 = 0.f, a3 = 0.f;
FOR_UNROLL (short i = 0; i < 8; i += 2) {
const uint ni = i / 2;
a0 += yl0 * (qs[0] & 0x0000000F);
a1 += yl1 * (qs[0] & 0x00000F00);
a2 += yl8 * (qs[0] & 0x0000F000);
a3 += yl9 * (qs[0] & 0x000F0000);
a0 += yl0 * (qs[1] & 0x0000000F);
a1 += yl1 * (qs[1] & 0x00000F00);
a2 += yl8 * (qs[1] & 0x0000F000);
a3 += yl9 * (qs[1] & 0x000F0000);
a0 += yl0 * (qs[2] & 0x0000000F);
a1 += yl1 * (qs[2] & 0x00000F00);
a2 += yl8 * (qs[2] & 0x0000F000);
a3 += yl9 * (qs[2] & 0x000F0000);
a0 += yl0 * (qs[3] & 0x0000000F);
a1 += yl1 * (qs[3] & 0x00000F00);
a2 += yl8 * (qs[3] & 0x0000F000);
a3 += yl9 * (qs[3] & 0x000F0000);
sumf[row] += d * (sumy[0] + sumy[1]) * -8.f + d * (a0 + a1 + a2 + a3);
acc += ((q0 >> (4 * ni)) & 0xF) * yb[i + 0]
+ ((q0 >> (4 * (ni + 1))) & 0xF) * yb[i + 1]
+ ((q1 >> (4 * ni)) & 0xF) * yb[i + 16]
+ ((q1 >> (4 * (ni + 1))) & 0xF) * yb[i + 17];
}
sumf[row] += d * (acc + sumy * -8.f);
}
yb += QK4_0 * 16;