graph : fix nkvo offload with FA (#19105)

This commit is contained in:
Georgi Gerganov
2026-01-26 20:18:34 +02:00
committed by GitHub
parent 142cbe2ac6
commit 8f80d1b254
2 changed files with 5 additions and 7 deletions
+5
View File
@@ -1630,6 +1630,11 @@ ggml_tensor * llm_graph_context::build_attn_mha(
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
if (!cparams.offload_kqv) {
// all nodes between the KV store and the attention output are run on the CPU
ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
}
ggml_flash_attn_ext_add_sinks(cur, sinks);
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);