llama : add support for Nemotron 3 Super (#20411)

* llama : add support for Nemotron 3 Super

This commit adds support for the Nemotron 3 Super model (120B.A12B),
enabling the model to be converted to GGUF format and run in llama.cpp.

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Matt Clayton <156335168+mattjcly@users.noreply.github.com>
Author:  Daniel Bevenius
Date:    2026-03-11 19:27:53 +01:00 (committed by GitHub)
Commit:  eaf1d7930c (parent 76ea1c1c46)
11 changed files with 97 additions and 14 deletions
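With this change in place, conversion follows the usual convert_hf_to_gguf.py flow. A typical invocation might look like the following; the checkpoint path and output file name are placeholders, not values from this commit:

    python convert_hf_to_gguf.py /path/to/Nemotron-3-Super --outfile nemotron-3-super-f16.gguf --outtype f16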
gguf-py/gguf/constants.py (+8)
@@ -125,6 +125,7 @@ class Keys:
         EXPERT_GROUP_SCALE   = "{arch}.expert_group_scale"
         EXPERTS_PER_GROUP    = "{arch}.experts_per_group"
         MOE_EVERY_N_LAYERS   = "{arch}.moe_every_n_layers"
+        MOE_LATENT_SIZE      = "{arch}.moe_latent_size"
         NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
         NUM_DEEPSTACK_LAYERS = "{arch}.n_deepstack_layers"
         POOLING_TYPE         = "{arch}.pooling_type"
@@ -543,6 +544,8 @@ class MODEL_TENSOR(IntEnum):
     FFN_DOWN_CHEXP   = auto()
     FFN_UP_CHEXP     = auto()
     FFN_EXP_PROBS_B  = auto()
+    MOE_LATENT_DOWN  = auto() # nemotron 3 super
+    MOE_LATENT_UP    = auto() # nemotron 3 super
     ATTN_Q_NORM      = auto()
     ATTN_K_NORM      = auto()
     LAYER_OUT_NORM   = auto()
@@ -986,6 +989,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_UP_EXP:           "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.FFN_GATE_UP_EXP:      "blk.{bid}.ffn_gate_up_exps",
     MODEL_TENSOR.FFN_EXP_PROBS_B:      "blk.{bid}.exp_probs_b",
+    MODEL_TENSOR.MOE_LATENT_DOWN:      "blk.{bid}.ffn_latent_down", # nemotron 3 super
+    MODEL_TENSOR.MOE_LATENT_UP:        "blk.{bid}.ffn_latent_up", # nemotron 3 super
     MODEL_TENSOR.LAYER_OUT_NORM:       "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
     MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
@@ -2913,6 +2918,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
+        # expert latent
+        MODEL_TENSOR.MOE_LATENT_DOWN,
+        MODEL_TENSOR.MOE_LATENT_UP,
         # shared expert
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
gguf-py/gguf/gguf_writer.py (+3)
@@ -859,6 +859,9 @@ class GGUFWriter:
     def add_moe_every_n_layers(self, value: int) -> None:
         self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
 
+    def add_moe_latent_size(self, value: int) -> None:
+        self.add_uint32(Keys.LLM.MOE_LATENT_SIZE.format(arch=self.arch), value)
+
     def add_nextn_predict_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
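For context, a minimal sketch of how a converter could record the new key, in the style of the other add_* helpers; the architecture string and the config key name below are illustrative assumptions, not taken from this commit:

    from gguf import GGUFWriter

    hparams = {"moe_latent_size": 2048}                 # placeholder value, key name assumed
    writer = GGUFWriter("model.gguf", "nemotron3super") # arch string assumed
    writer.add_moe_latent_size(hparams["moe_latent_size"])  # stored as uint32 under {arch}.moe_latent_size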
gguf-py/gguf/tensor_mapping.py (+8)
@@ -571,6 +571,14 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.gate_up_proj",
         ),
 
+        MODEL_TENSOR.MOE_LATENT_DOWN: (
+            "backbone.layers.{bid}.mixer.fc1_latent_proj", # nemotron 3 super
+        ),
+
+        MODEL_TENSOR.MOE_LATENT_UP: (
+            "backbone.layers.{bid}.mixer.fc2_latent_proj", # nemotron 3 super
+        ),
+
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox