llama : add support for Nemotron 3 Super (#20411)
* llama : add support for Nemotron 3 Super. This commit adds support for the Nemotron 3 Super model (120B.A12B), enabling this model to be converted to GGUF format and run in llama.cpp. Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Matt Clayton <156335168+mattjcly@users.noreply.github.com>
This commit is contained in:
@@ -125,6 +125,7 @@ class Keys:
         EXPERT_GROUP_SCALE   = "{arch}.expert_group_scale"
         EXPERTS_PER_GROUP    = "{arch}.experts_per_group"
         MOE_EVERY_N_LAYERS   = "{arch}.moe_every_n_layers"
+        MOE_LATENT_SIZE      = "{arch}.moe_latent_size"
         NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
         NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers"
         POOLING_TYPE         = "{arch}.pooling_type"
@@ -543,6 +544,8 @@ class MODEL_TENSOR(IntEnum):
     FFN_DOWN_CHEXP  = auto()
     FFN_UP_CHEXP    = auto()
     FFN_EXP_PROBS_B = auto()
+    MOE_LATENT_DOWN = auto()  # nemotron 3 super
+    MOE_LATENT_UP   = auto()  # nemotron 3 super
     ATTN_Q_NORM     = auto()
     ATTN_K_NORM     = auto()
     LAYER_OUT_NORM  = auto()
@@ -986,6 +989,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_UP_EXP:           "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.FFN_GATE_UP_EXP:      "blk.{bid}.ffn_gate_up_exps",
     MODEL_TENSOR.FFN_EXP_PROBS_B:      "blk.{bid}.exp_probs_b",
+    MODEL_TENSOR.MOE_LATENT_DOWN:      "blk.{bid}.ffn_latent_down",  # nemotron 3 super
+    MODEL_TENSOR.MOE_LATENT_UP:        "blk.{bid}.ffn_latent_up",    # nemotron 3 super
     MODEL_TENSOR.LAYER_OUT_NORM:       "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd",  # gemma3n
     MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj",  # gemma3n
@@ -2913,6 +2918,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
+        # expert latent
+        MODEL_TENSOR.MOE_LATENT_DOWN,
+        MODEL_TENSOR.MOE_LATENT_UP,
         # shared expert
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
||||
@@ -859,6 +859,9 @@ class GGUFWriter:
     def add_moe_every_n_layers(self, value: int) -> None:
         self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)

+    def add_moe_latent_size(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.MOE_LATENT_SIZE.format(arch=self.arch), value)
+
     def add_nextn_predict_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
||||
@@ -571,6 +571,14 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.gate_up_proj",
         ),

+        MODEL_TENSOR.MOE_LATENT_DOWN: (
+            "backbone.layers.{bid}.mixer.fc1_latent_proj",  # nemotron 3 super
+        ),
+
+        MODEL_TENSOR.MOE_LATENT_UP: (
+            "backbone.layers.{bid}.mixer.fc2_latent_proj",  # nemotron 3 super
+        ),
+
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",  # gptneox
||||
Reference in New Issue
Block a user