model : remove duplicate wo_s scale after build_attn (Qwen3, LLaMA) (#22421)

Signed-off-by: Yash Nankani <ynankani@nvidia.com>
This commit is contained in:
ynankani
2026-04-27 07:58:48 +00:00
committed by GitHub
parent d13540becd
commit 0f1bb602dd
3 changed files with 0 additions and 9 deletions
-3
View File
@@ -72,9 +72,6 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
cur = build_attn(inp_attn, cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s, model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
}
cb(cur, "attn_out", il); cb(cur, "attn_out", il);
} }
if (il == n_layer - 1 && inp_out_ids) { if (il == n_layer - 1 && inp_out_ids) {
-3
View File
@@ -58,9 +58,6 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
cur = build_attn(inp_attn, cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s, model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
}
} }
if (il == n_layer - 1 && inp_out_ids) { if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-3
View File
@@ -58,9 +58,6 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
cur = build_attn(inp_attn, cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s, model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
if (model.layers[il].wo_s) {
cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
}
} }
if (il == n_layer - 1 && inp_out_ids) { if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = ggml_get_rows(ctx0, cur, inp_out_ids);