llama: end-to-end tests (#19802)

* tests: add end-to-end tests per model architecture

* fixup for rebase

* fix use-after-free in llama-model-loader.cpp

* fix CI

* fix WebGPU

* fix CI

* disable CI for macOS-latest-cmake-arm64

* use expert_weights_scale only if != 0.0f

* comments
Author: Johannes Gäßler
Date: 2026-03-08 12:30:21 +01:00
Committed by: GitHub
Parent: a95047979a
Commit: a976ff081b
33 changed files with 1607 additions and 633 deletions
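As a sketch of what an end-to-end test per model architecture can boil down to, the following standalone program runs a single decode through the public llama.h API and checks that the logits come out finite. The model path is a placeholder and the harness this commit actually adds is more elaborate; this only shows the minimal shape of such a test, assuming a current llama.h.

// Minimal end-to-end smoke test sketch (hypothetical path, illustrative check).
#include "llama.h"

#include <cmath>
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("tiny-model.gguf", mparams); // placeholder path
    if (model == nullptr) {
        return 1;
    }

    llama_context * ctx = llama_init_from_model(model, llama_context_default_params());

    // decode a single token end-to-end through the compute graph
    llama_token bos = llama_vocab_bos(llama_model_get_vocab(model));
    if (llama_decode(ctx, llama_batch_get_one(&bos, 1)) != 0) {
        return 1;
    }

    // sanity check: every logit must be finite, otherwise the graph is broken
    const float * logits  = llama_get_logits(ctx);
    const int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));
    for (int32_t i = 0; i < n_vocab; i++) {
        if (!std::isfinite(logits[i])) {
            std::fprintf(stderr, "non-finite logit at index %d\n", i);
            return 1;
        }
    }

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}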
+1
@@ -56,6 +56,7 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_grap
 );
 break;
 case LLM_TYPE_13B:
+case LLM_TYPE_UNKNOWN:
 break;
 default:
 GGML_ABORT("fatal error");
+1 -1
@@ -90,7 +90,7 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const ll
 model.layers[il].ffn_exp_probs_b,
 n_expert, n_expert_used,
 LLM_FFN_SILU, hparams.expert_weights_norm,
-true, hparams.expert_weights_scale,
+hparams.expert_weights_scale, hparams.expert_weights_scale,
 (llama_expert_gating_func_type) hparams.expert_gating_func,
 il);
 cb(moe_out, "ffn_moe_out", il);
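This flag/value change, repeated verbatim below for dots1, exaone_moe, glm4_moe, kimi_linear, and nemotron_h, implements the "use expert_weights_scale only if != 0.0f" item from the commit message: the first argument of the pair is the boolean scale-enable flag, and passing the float there makes the scale its own enable switch via the standard float-to-bool conversion (false exactly for 0.0f). A standalone illustration, with a stand-in for the flag/value parameter pair rather than build_moe_ffn's real signature:

#include <cstdio>

// Stand-in for the (bool scale_w, float w_scale) parameter pair of an MoE
// FFN builder; the real function takes many more arguments.
static void moe_scale(bool scale_w, float w_scale) {
    std::printf("scale applied: %s (w_scale = %.2f)\n", scale_w ? "yes" : "no", w_scale);
}

int main() {
    // float -> bool conversion: false exactly for 0.0f, true otherwise,
    // so passing the scale twice means "scale only if the scale is nonzero".
    float expert_weights_scale = 0.0f;
    moe_scale(expert_weights_scale, expert_weights_scale); // scale applied: no
    expert_weights_scale = 1.8f;
    moe_scale(expert_weights_scale, expert_weights_scale); // scale applied: yes
    return 0;
}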
+1 -1
@@ -91,7 +91,7 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_para
 model.layers[il].ffn_exp_probs_b,
 n_expert, n_expert_used,
 LLM_FFN_SILU, hparams.expert_weights_norm,
-true, hparams.expert_weights_scale,
+hparams.expert_weights_scale, hparams.expert_weights_scale,
 (llama_expert_gating_func_type) hparams.expert_gating_func,
 il);
 cb(moe_out, "ffn_moe_out", il);
+1 -1
@@ -100,7 +100,7 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_
 model.layers[il].ffn_exp_probs_b,
 n_expert, n_expert_used,
 LLM_FFN_SILU, hparams.expert_weights_norm,
-true, hparams.expert_weights_scale,
+hparams.expert_weights_scale, hparams.expert_weights_scale,
 (llama_expert_gating_func_type) hparams.expert_gating_func,
 il);
 cb(moe_out, "ffn_moe_out", il);
+1 -1
@@ -128,7 +128,7 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 model.layers[il].ffn_exp_probs_b,
 n_expert, n_expert_used,
 LLM_FFN_SILU, hparams.expert_weights_norm,
-true, hparams.expert_weights_scale,
+hparams.expert_weights_scale, hparams.expert_weights_scale,
 (llama_expert_gating_func_type) hparams.expert_gating_func,
 il);
 cb(routed_out, "ffn_moe_out", il);
+3 -11
@@ -118,12 +118,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
 ggml_build_forward_expand(gf, cur);
-// Check layer type by checking which tensors exist
-// KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
-bool is_kda = (layer.ssm_a != nullptr);
-bool is_mla = (layer.wkv_a_mqa != nullptr);
-if (is_kda) {
+if (hparams.is_recurrent(il)) {
 // === KDA Layer (Kimi Delta Attention) with Recurrent State ===
 // Reference: vLLM kda.py
 const auto * mctx_cur = inp_rs->mctx;
@@ -211,7 +206,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
 cur = ggml_mul_mat(ctx0, layer.wo, gated);
 cb(cur, "kda_out", il);
-} else if (is_mla) {
+} else {
 // === MLA Layer (Multi-head Latent Attention) without KV Cache ===
 // Reference: vLLM mla.py
 // Step 1: Q projection and reshape
@@ -310,9 +305,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
 cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
 cb(cur, "mla_out", il);
 }
-} else {
-// Unknown layer type - this should not happen
-GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
 }
 // On last layer, select only the output tokens
@@ -349,7 +341,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
 hparams.n_expert,
 hparams.n_expert_used,
 LLM_FFN_SILU, true,
-true, hparams.expert_weights_scale,
+hparams.expert_weights_scale, hparams.expert_weights_scale,
 (llama_expert_gating_func_type) hparams.expert_gating_func,
 il);
 cb(moe_out, "ffn_moe_out", il);
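Net effect of the kimi_linear changes: the layer-type dispatch now keys off hparams.is_recurrent(il) instead of probing which tensors happen to be loaded, which makes the old "neither KDA nor MLA" fallback unreachable and lets the is_kda/is_mla locals and the GGML_ABORT branch go away.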
+4
@@ -30,6 +30,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba_layer(llm_graph_input_rs * inp,
 GGML_ASSERT(n_seqs != 0);
 GGML_ASSERT(ubatch.equal_seqs());
 GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+GGML_ASSERT(d_inner % n_head == 0);
 ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
 ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
@@ -154,6 +155,7 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
 const auto kv_head = mctx_cur->get_head();
+const int64_t n_embd = hparams.n_embd;
 const int64_t d_conv = hparams.ssm_d_conv;
 const int64_t d_inner = hparams.ssm_d_inner;
 const int64_t d_state = hparams.ssm_d_state;
@@ -167,6 +169,8 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp,
 GGML_ASSERT(n_seqs != 0);
 GGML_ASSERT(ubatch.equal_seqs());
 GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+GGML_ASSERT(d_inner % n_head == 0);
+GGML_ASSERT(d_inner % (n_group*n_embd) == 0);
 ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
 ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
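The added asserts pin down the shape invariants the SSM graph relies on: d_inner is split into n_head heads, so the division has to be exact. A minimal standalone illustration with made-up values in place of the real hparams:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // made-up stand-ins for hparams.ssm_d_inner and n_head
    const int64_t d_inner = 1536;
    const int64_t n_head  = 24;

    assert(d_inner % n_head == 0); // what the new GGML_ASSERT enforces

    // the per-head dimension used by the downstream reshapes; without the
    // assert, integer division would silently truncate on bad hparams
    const int64_t head_dim = d_inner / n_head;
    std::printf("head_dim = %lld\n", (long long) head_dim);
    return 0;
}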
+1 -1
@@ -124,7 +124,7 @@ ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const lla
 model.layers[il].ffn_exp_probs_b,
 n_expert, n_expert_used,
 LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
-true, hparams.expert_weights_scale,
+hparams.expert_weights_scale, hparams.expert_weights_scale,
 LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
 il);
 cb(moe_out, "ffn_moe_out", il);
+3 -1
@@ -27,7 +27,7 @@ llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_pa
 cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
 // check if this layer is Mamba or Attention
-bool is_mamba_layer = hparams.is_recurrent(il);
+const bool is_mamba_layer = hparams.is_recurrent(il);
 if (is_mamba_layer) {
 // PLaMo-2 Mamba layer
@@ -171,6 +171,8 @@ ggml_tensor * llm_build_plamo2::build_plamo2_mamba_layer(llm_graph_input_rs * in
 GGML_ASSERT(n_seqs != 0);
 GGML_ASSERT(ubatch.equal_seqs());
 GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+GGML_ASSERT(d_inner % n_head == 0);
+GGML_ASSERT(n_group == 0);
 ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
 ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
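GGML_ASSERT(n_group == 0) is stricter than the divisibility checks added to the shared Mamba code above; presumably PLaMo-2's Mamba layers do not use SSM groups at all, so a nonzero n_group would indicate a malformed or misconverted model rather than a supported configuration.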