model: mistral small 4 support (#20649)

* model: mistral small 4 support

* fix test

* fix test (2)

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* change newline

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Commit d34ff7eb5b (parent 45172df4d6)
Author: Xuan-Son Nguyen
Date: 2026-03-17 00:31:14 +01:00
Committed by: GitHub
6 changed files with 133 additions and 42 deletions

convert_hf_to_gguf.py: +83 -39
@@ -298,11 +298,16 @@ class ModelBase:
             scale = scale.float()
 
             if block_size is not None:
+                dim_offset = scale.ndim - len(block_size)
                 for i, size in enumerate(block_size):
-                    scale = scale.repeat_interleave(size, i)
+                    scale = scale.repeat_interleave(size, dim_offset + i)
                 # unpad the scale (e.g. when the tensor size isn't a multiple of the block size)
                 scale = scale[tuple(slice(0, size) for size in weight.shape)]
 
+            # align scale dims to weight for correct broadcasting (e.g. [128] -> [128, 1, 1])
+            while scale.ndim < weight.ndim:
+                scale = scale.unsqueeze(-1)
+
             return weight.float() * scale
 
         # ref: https://github.com/ModelCloud/GPTQModel/blob/037c5c0f6c9e33c500d975b038d02e7ca437546d/gptqmodel/nn_modules/qlinear/__init__.py#L437-L476
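
Why the dim_offset matters: when a checkpoint stores experts pre-stacked, the scale tensor carries an extra leading dimension that block_size does not describe, so the repeats must target the trailing dims. A minimal sketch (shapes invented for illustration, not taken from any particular checkpoint):

import torch

# weight: a stacked 3D tensor (experts x rows x cols); block_size only
# describes the last two dims, so the leading expert dim must be skipped
weight = torch.randn(4, 6, 8)
block_size = [3, 4]
scale = torch.rand(4, 2, 2)                # one scale per 3x4 block, per expert

dim_offset = scale.ndim - len(block_size)  # = 1: skip the expert dim
for i, size in enumerate(block_size):
    scale = scale.repeat_interleave(size, dim_offset + i)
assert scale.shape == weight.shape         # (4, 6, 8); the old code expanded dim 0 instead

# the new unsqueeze loop handles the opposite case: a per-row scale with
# fewer dims than the weight, e.g. shape [4] -> [4, 1, 1] for broadcasting
row_scale = torch.rand(4)
while row_scale.ndim < weight.ndim:
    row_scale = row_scale.unsqueeze(-1)
dequant = weight.float() * row_scale       # broadcasts over rows and cols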
@@ -393,7 +398,7 @@ class ModelBase:
         elif quant_method == "fp8":
             block_size = quant_config.get("weight_block_size")
             for name in self.model_tensors.keys():
-                if name.endswith(".weight_scale_inv"):
+                if name.endswith("_scale_inv"):
                     weight_name = name.removesuffix("_scale_inv")
                     w = self.model_tensors[weight_name]
                     s = self.model_tensors[name]
@@ -401,6 +406,8 @@ class ModelBase:
                     tensors_to_remove.append(name)
                 if name.endswith(".activation_scale"):  # unused
                     tensors_to_remove.append(name)
+                if name.endswith("_activation_scale"):  # Mistral-Small-4-119B-2602, unused
+                    tensors_to_remove.append(name)
                 # mistral format
                 if name.endswith(".qscale_weight"):
                     weight_name = name.removesuffix("qscale_weight") + "weight"
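
The relaxed suffix check exists because the new checkpoints name some scales without a ".weight" component. A sketch with hypothetical tensor names (the exact Mistral-Small-4 names are an assumption here):

# the first name follows the usual HF fp8 scheme; the second drops the
# ".weight" part, which the old ".weight_scale_inv" check would have missed
names = [
    "model.layers.0.mlp.down_proj.weight_scale_inv",
    "model.layers.0.mlp.gate_up_proj_scale_inv",
]
for name in names:
    assert name.endswith("_scale_inv")
    print(name.removesuffix("_scale_inv"))
# model.layers.0.mlp.down_proj.weight
# model.layers.0.mlp.gate_up_proj

The second result carries no ".weight" suffix, which lines up with the Mistral4Model.modify_tensors change further down that appends ".weight" to bare ".down_proj"/".gate_up_proj" names.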
@@ -3031,10 +3038,16 @@ class LlavaVisionModel(MmprojModel):
     def get_token_id(self, token: str) -> int:
         tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
         with open(tokenizer_config_file, "r", encoding="utf-8") as f:
-            added_tokens_decoder = json.load(f)['added_tokens_decoder']
+            added_tokens_decoder = json.load(f).get('added_tokens_decoder') or {}
             for id_, token_data in added_tokens_decoder.items():
-                if token_data["content"] == token:
+                if token_data.get("content") == token:
                     return int(id_)
+        # fallthrough to tokenizer.json
+        with open(self.dir_model / "tokenizer.json", "r", encoding="utf-8") as f:
+            tokenizer_json = json.load(f)
+            for token_data in tokenizer_json["added_tokens"]:
+                if token_data["content"] == token:
+                    return int(token_data["id"])
         raise ValueError(f"Token '{token}' not found in tokenizer config.")
 
     def set_gguf_parameters(self):
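
The get_token_id change is easiest to see with in-memory stand-ins for the two JSON files (contents here are made up): the first lookup now tolerates a missing 'added_tokens_decoder' key and falls through to tokenizer.json.

tokenizer_config = {"model_max_length": 131072}  # no 'added_tokens_decoder' key
tokenizer_json = {"added_tokens": [{"id": 10, "content": "[IMG]"}]}

def get_token_id(token: str) -> int:
    # stage 1: tokenizer_config.json, tolerant of the key being absent
    decoder = tokenizer_config.get("added_tokens_decoder") or {}
    for id_, data in decoder.items():
        if data.get("content") == token:
            return int(id_)
    # stage 2: fall through to tokenizer.json's added_tokens list
    for data in tokenizer_json["added_tokens"]:
        if data["content"] == token:
            return int(data["id"])
    raise ValueError(f"Token '{token}' not found in tokenizer config.")

print(get_token_id("[IMG]"))  # 10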
@@ -3198,40 +3211,6 @@ class Llama4VisionModel(MmprojModel):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register(
-    "Mistral3ForConditionalGeneration",
-    "Ministral3ForCausalLM",
-)
-class Mistral3Model(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.MISTRAL3
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # for compatibility, we use LLAMA arch for older models
-        # TODO: remove this once everyone has migrated to newer version of llama.cpp
-        if self.hparams.get("model_type") != "ministral3":
-            self.model_arch = gguf.MODEL_ARCH.LLAMA
-            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
-            self.gguf_writer.add_architecture()
-            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        rope_params = self.rope_parameters
-        if self.hparams.get("model_type") == "ministral3":
-            assert rope_params, "ministral3 must have 'rope_parameters' config"
-            assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
-            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        name = name.replace("language_model.", "")
-        if "multi_modal_projector" in name or "vision_tower" in name:
-            return
-        yield from super().modify_tensors(data_torch, name, bid)
-
-
 @ModelBase.register("DeciLMForCausalLM")
 class DeciModel(TextModel):
     model_arch = gguf.MODEL_ARCH.DECI
@@ -8271,6 +8250,8 @@ class DeepseekV2Model(TextModel):
     # TODO @ngxson : remove this when we support MTP for deepseek models
     skip_mtp = True
 
+    merge_expert = True
+
     def set_vocab(self):
         try:
             self._set_vocab_gpt2()
@@ -8409,7 +8390,7 @@ class DeepseekV2Model(TextModel):
             return
 
         # process the experts separately
-        if name.find("mlp.experts") != -1:
+        if self.merge_expert and name.find("mlp.experts") != -1:
             n_experts = self.hparams["n_routed_experts"]
             assert bid is not None
 
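
The merge_expert guard reflects a shape contract rather than new math. A sketch (dimensions invented; the converter's real merge path is more involved than a single torch.stack):

import torch

n_experts, n_ff, n_embd = 4, 8, 16

# DeepSeek-style checkpoints ship one 2D tensor per expert; the merge
# branch collects n_experts of them and stacks them into one 3D tensor
per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]
merged = torch.stack(per_expert, dim=0)            # shape (4, 8, 16)

# Mistral-Small-4 already stores the experts stacked, so with
# merge_expert = False the tensor skips the stacking branch unchanged
pre_stacked = torch.randn(n_experts, n_ff, n_embd)
assert merged.shape == pre_stacked.shape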
@@ -8468,6 +8449,69 @@ class DeepseekV2Model(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register(
"Mistral3ForConditionalGeneration",
"Ministral3ForCausalLM",
)
class Mistral3Model(TextModel):
class Ministral3Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.MISTRAL3
def set_gguf_parameters(self):
super().set_gguf_parameters()
rope_params = self.rope_parameters
if self.hparams.get("model_type") == "ministral3":
assert rope_params, "ministral3 must have 'rope_parameters' config"
assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
name = name.replace("language_model.", "")
if "multi_modal_projector" in name or "vision_tower" in name:
return
yield from super().modify_tensors(data_torch, name, bid)
class Mistral4Model(DeepseekV2Model):
model_arch = gguf.MODEL_ARCH.MISTRAL4
skip_mtp = False # model contains no MTP layers, so no need to skip
merge_expert = False # experts are already stacked as 3D
def modify_tensors(self, data_torch, name, bid):
if name.endswith(".down_proj") or name.endswith(".gate_up_proj"):
name = name + ".weight"
yield from super().modify_tensors(data_torch, name, bid)
model_arch = gguf.MODEL_ARCH.MISTRAL3 # unused
impl: TextModel
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams.get("model_type") == "mistral4":
self.impl = Mistral3Model.Mistral4Model(*args, **kwargs)
else:
self.impl = Mistral3Model.Ministral3Model(*args, **kwargs)
def set_vocab(self):
self.impl.set_vocab()
def set_gguf_parameters(self):
self.impl.set_gguf_parameters()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
yield from self.impl.modify_tensors(data_torch, name, bid)
def prepare_tensors(self):
self.impl.prepare_tensors()
def write_vocab(self):
self.impl.write_vocab()
def write(self):
self.impl.write()
@ModelBase.register("MiniMaxM2ForCausalLM")
class MiniMaxM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MINIMAXM2
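
Structurally, the re-added Mistral3Model is no longer a LlamaModel subclass but a dispatcher: one registered entry point that instantiates either a LlamaModel-derived or a DeepseekV2Model-derived implementation at runtime, which single inheritance cannot express. Reduced to a toy (class and method names here are illustrative only):

class Base:
    def __init__(self, model_type: str):
        self.model_type = model_type

class DenseImpl(Base):
    def describe(self) -> str:
        return "llama-style dense model"

class MoeImpl(Base):
    def describe(self) -> str:
        return "deepseek-style MoE model"

class Dispatcher(Base):
    # registered once; every public method forwards to the chosen impl
    def __init__(self, model_type: str):
        super().__init__(model_type)
        cls = MoeImpl if model_type == "mistral4" else DenseImpl
        self.impl = cls(model_type)

    def describe(self) -> str:
        return self.impl.describe()

print(Dispatcher("mistral4").describe())    # deepseek-style MoE model
print(Dispatcher("ministral3").describe())  # llama-style dense model

The cost of the pattern is the boilerplate forwarding (set_vocab, write, etc.); the benefit is that each implementation keeps its full inherited conversion logic.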