vulkan: support flash attention GQA/split_k with small batches (#18938)

Jeff Bolz
2026-01-21 10:43:43 -06:00
committed by GitHub
parent 067b8d7af3
commit 33f890e579
7 changed files with 74 additions and 44 deletions
tests/test-backend-ops.cpp +3
@@ -8460,6 +8460,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    // Qwen3-VL-8B https://github.com/ggml-org/llama.cpp/issues/17012
    test_cases.emplace_back(new test_flash_attn_ext(72, 72, 16, {1, 1}, 5776, 5776, false, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
    test_cases.emplace_back(new test_flash_attn_ext(64, 64, 8, {8, 1}, 7680, 1, true, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
    test_cases.emplace_back(new test_flash_attn_ext(64, 64, 8, {8, 1}, 7680, 4, true, false, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
    for (int kv : { 4096, 8192, 16384, }) {
        for (int hs : { 64, 128, }) {
            for (int nr : { 1, 4, }) {
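
For context on what these perf cases exercise: with split_k, the KV sequence is partitioned across several workgroups, each computing flash attention over its own KV slice, and a reduction pass then combines the partial results. Below is a minimal CPU sketch of that standard split_k combination, an illustration only and not the Vulkan shader changed by this commit; the SplitResult layout and function name are assumptions made for the example.

// Minimal CPU sketch (not the actual Vulkan shader) of the standard split_k
// reduction for flash attention. Each split i reports its partial output O_i
// (normalized over its own KV slice), the max logit m_i it saw, and its
// softmax denominator l_i; the reduction rescales everything to the global
// max and recombines.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

struct SplitResult {
    std::vector<float> O; // partial output row, length = head size
    float m;              // max logit within this KV slice
    float l;              // softmax denominator within this KV slice
};

static std::vector<float> combine_splits(const std::vector<SplitResult> & splits) {
    const size_t D = splits[0].O.size();

    // global max over all splits, for numerical stability
    float m = -INFINITY;
    for (const auto & s : splits) {
        m = std::max(m, s.m);
    }

    // rescale each split's contribution to the global max and accumulate
    float l = 0.0f;
    std::vector<float> O(D, 0.0f);
    for (const auto & s : splits) {
        const float scale = std::exp(s.m - m) * s.l;
        l += scale;
        for (size_t d = 0; d < D; ++d) {
            O[d] += scale * s.O[d];
        }
    }

    // renormalize by the combined softmax denominator
    for (size_t d = 0; d < D; ++d) {
        O[d] /= l;
    }
    return O;
}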