llama: Add option to merge gate and exp weights (#19139)
* llama: Add option to merge gate and exp weights
* Update convert_hf_to_gguf.py
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* Update convert_hf_to_gguf.py
  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* update constants.py
* add gate_up for all the MoE models
* convert: simplify merge tensor condition
* update constants.py
* reduce number of models, add create_tensor_gate_up helper
---------
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
@@ -218,7 +218,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
||||
LLM_FFN_SILU, hparams.expert_weights_norm,
|
||||
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
||||
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
||||
- il);
|
||||
+ il,
|
||||
+ nullptr,
|
||||
+ model.layers[il].ffn_gate_up_exps);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
// FFN shared expert
|
||||
|
||||
@@ -380,7 +380,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int
|
||||
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used, LLM_FFN_SILU,
|
||||
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
||||
+ true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
|
||||
+ nullptr, model.layers[il].ffn_gate_up_exps);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
// Add shared experts if present - following Qwen3Next reference implementation
|
||||
|
||||
@@ -479,7 +479,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
|
||||
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
||||
nullptr,
|
||||
n_expert, n_expert_used, LLM_FFN_SILU,
|
||||
- true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
||||
+ true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il,
|
||||
+ nullptr, model.layers[il].ffn_gate_up_exps);
|
||||
cb(moe_out, "ffn_moe_out", il);
|
||||
|
||||
// Add shared experts if present - following Qwen3Next reference implementation
|
||||
|
||||
Reference in New Issue
Block a user