ggml-cpu: FA split across kv for faster TG (#19209)

* ggml-cpu: split across kv for faster TG

* simplify sinks application

* add ref impl
This commit is contained in:
Aman Gupta
2026-02-03 01:19:55 +08:00
committed by GitHub
parent a3fa035822
commit 9f682fb640
6 changed files with 220 additions and 69 deletions
+7
View File
@@ -8591,6 +8591,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
output_printer->print_operation(info);
return false;
}
// Use reference implementation on the CPU backend for comparison
using ggml_backend_cpu_set_use_ref_t = void (*)(ggml_backend_t, bool);
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
auto * set_use_ref = (ggml_backend_cpu_set_use_ref_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_use_ref");
if (set_use_ref) {
set_use_ref(backend_cpu, true);
}
size_t n_ok = 0;
size_t tests_run = 0;