ggml : use 64 bytes aligned tile buffers (#21058)

| Model                            | Test   |   t/s OLD |   t/s NEW |   Speedup |
|:---------------------------------|:-------|----------:|----------:|----------:|
| qwen35 0.8B BF16                 | pp512  |    584.59 |    595.41 |      1.02 |
| qwen35 0.8B BF16                 | tg128  |     52.23 |     52.82 |      1.01 |
| qwen35 0.8B IQ2_M - 2.7 bpw      | pp512  |    260.64 |    261.70 |      1.00 |
| qwen35 0.8B IQ2_M - 2.7 bpw      | tg128  |     81.17 |     80.89 |      1.00 |
| qwen35 0.8B IQ2_XXS - 2.0625 bpw | pp512  |    302.36 |    302.56 |      1.00 |
| qwen35 0.8B IQ2_XXS - 2.0625 bpw | tg128  |     84.93 |     85.12 |      1.00 |
| qwen35 0.8B IQ3_XXS - 3.0625 bpw | pp512  |    263.22 |    260.01 |      0.99 |
| qwen35 0.8B IQ3_XXS - 3.0625 bpw | tg128  |     80.29 |     78.94 |      0.98 |
| qwen35 0.8B IQ4_NL - 4.5 bpw     | pp512  |    728.65 |    742.09 |      1.02 |
| qwen35 0.8B IQ4_NL - 4.5 bpw     | tg128  |     82.39 |     84.46 |      1.03 |
| qwen35 0.8B IQ4_XS - 4.25 bpw    | pp512  |    681.33 |    677.06 |      0.99 |
| qwen35 0.8B IQ4_XS - 4.25 bpw    | tg128  |     80.18 |     79.28 |      0.99 |
| qwen35 0.8B Q2_K_M               | pp512  |    413.28 |    415.94 |      1.01 |
| qwen35 0.8B Q2_K_M               | tg128  |     81.90 |     82.78 |      1.01 |
| qwen35 0.8B Q3_K_M               | pp512  |    493.17 |    495.08 |      1.00 |
| qwen35 0.8B Q3_K_M               | tg128  |     82.75 |     83.23 |      1.01 |
| qwen35 0.8B Q3_K_S               | pp512  |    429.35 |    427.64 |      1.00 |
| qwen35 0.8B Q3_K_S               | tg128  |     86.69 |     87.02 |      1.00 |
| qwen35 0.8B Q4_0                 | pp512  |    783.46 |    782.32 |      1.00 |
| qwen35 0.8B Q4_0                 | tg128  |     88.23 |     87.90 |      1.00 |
| qwen35 0.8B Q4_1                 | pp512  |    741.71 |    729.76 |      0.98 |
| qwen35 0.8B Q4_1                 | tg128  |     85.44 |     86.01 |      1.01 |
| qwen35 0.8B Q4_K_M               | pp512  |    676.24 |    681.31 |      1.01 |
| qwen35 0.8B Q4_K_M               | tg128  |     76.59 |     77.06 |      1.01 |
| qwen35 0.8B Q4_K_S               | pp512  |    683.12 |    688.81 |      1.01 |
| qwen35 0.8B Q4_K_S               | tg128  |     80.50 |     81.19 |      1.01 |
| qwen35 0.8B Q5_K_M               | pp512  |    635.33 |    642.11 |      1.01 |
| qwen35 0.8B Q5_K_M               | tg128  |     72.07 |     72.49 |      1.01 |
| qwen35 0.8B Q5_K_S               | pp512  |    660.95 |    658.18 |      1.00 |
| qwen35 0.8B Q5_K_S               | tg128  |     72.19 |     72.95 |      1.01 |
| qwen35 0.8B Q6_K                 | pp512  |    647.97 |    638.84 |      0.99 |
| qwen35 0.8B Q6_K                 | tg128  |     72.83 |     72.49 |      1.00 |
| qwen35 0.8B Q8_0                 | pp512  |    805.01 |    785.49 |      0.98 |
| qwen35 0.8B Q8_0                 | tg128  |     70.10 |     70.13 |      1.00 |

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-04-27 08:30:55 +02:00
parent 5594d13224
commit f84270ea10
1 changed files with 16 additions and 16 deletions
@@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
    const int lda = KB * sizeof(TA);
    //const int ldb = KB * sizeof(TB);

-    static thread_local packed_B_t Tile0[TILE_N * TILE_K];
-    static thread_local packed_B_t Tile1[TILE_N * TILE_K];
-    static thread_local int8_t Tile23[TILE_M * TILE_K];
+    alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K];
+    alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K];
+    alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];

-    static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
-    static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
+    alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
+    alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4];

    // double buffering C to interleave avx512 and amx
    int32_t * C_cur = TileC0;
@@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
    const int m1 = std::max(M - TILE_M, 0);
    //const int lda = KB * sizeof(TA);

-    static thread_local int8_t Tile0[TILE_N * TILE_K];
-    static thread_local int8_t Tile1[TILE_N * TILE_K];
-    static thread_local int8_t Tile23[TILE_M * TILE_K];
+    alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K];
+    alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K];
+    alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];

    // mat mul result for each group
-    static thread_local int32_t Tile4[TILE_M * TILE_N];
-    static thread_local int32_t Tile5[TILE_M * TILE_N];
-    static thread_local int32_t Tile6[TILE_M * TILE_N];
-    static thread_local int32_t Tile7[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N];

    // sum of each QK_K block, contains 8 groups, int32
-    static thread_local int32_t Sumi4[TILE_M * TILE_N];
-    static thread_local int32_t Sumi5[TILE_M * TILE_N];
-    static thread_local int32_t Sumi6[TILE_M * TILE_N];
-    static thread_local int32_t Sumi7[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N];
+    alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N];

    const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
    for (int i = 0; i < KB; ++i) {