model : remove duplicate wo_s scale after build_attn (Qwen3, LLaMA) (#22421)
Signed-off-by: Yash Nankani <ynankani@nvidia.com>
This commit is contained in:
@@ -72,9 +72,6 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
-if (model.layers[il].wo_s) {
-cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
-}
 cb(cur, "attn_out", il);
 }
 if (il == n_layer - 1 && inp_out_ids) {
|
|||||||
@@ -58,9 +58,6 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-if (model.layers[il].wo_s) {
-cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
-}
 }
 if (il == n_layer - 1 && inp_out_ids) {
 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|||||||
@@ -58,9 +58,6 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
 cur = build_attn(inp_attn,
 model.layers[il].wo, model.layers[il].wo_b, model.layers[il].wo_s,
 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
-if (model.layers[il].wo_s) {
-cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
-}
 }
 if (il == n_layer - 1 && inp_out_ids) {
 cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|||||||
Reference in New Issue
Block a user