models : fix the attn_factor for mistral3 graphs + improve consistency (#17945)

* models : fix the attn_factor for mistral3 graphs

* cont : rework attn_factor correction logic

* cont : make deepseek2 consistent

* cont : add TODO

* cont : special-case DSv2

* cont : revert Mistral 3 Large changes

* cont : fix DS2 to use the original attn_factor

* cont : minor comments
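For context, a minimal standalone sketch (not part of this commit) of the attn_factor correction the message refers to, assuming the graph's attn_factor has already been divided by the standard YaRN magnitude scale 1 + 0.1*ln(1/freq_scale) by llama_hparams::yarn_attn_factor_adjust; the helper name is illustrative only:

    #include <cmath>

    // Illustrative helper (not the llama.cpp API): cancel the adjustment applied by
    // llama_hparams::yarn_attn_factor_adjust to recover the original attention factor,
    // as the DeepSeek-2 hunk below does inline.
    static float yarn_attn_factor_restore(float attn_factor_adjusted, float freq_scale) {
        return attn_factor_adjusted * (1.0f + 0.1f * logf(1.0f / freq_scale));
    }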
Author: Georgi Gerganov
Date:   2025-12-12 17:12:40 +02:00
Committed by: GitHub
parent dcb7d17758
commit 7bed317f53
7 changed files with 78 additions and 33 deletions
+9 -5
@@ -1,7 +1,5 @@
 #include "models.h"
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
-    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
-    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
     ggml_tensor * cur;
     ggml_tensor * inpL;
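A minimal sketch of the pre-scaling performed in the hunk above, assuming attn_factor_org is the unadjusted attention factor and rope_yarn_log_mul is the model's YaRN log-multiplier hyperparameter; the function name is hypothetical:

    #include <cmath>

    // Illustrative only: fold the original attention factor into the KQ scale so the
    // attention softmax scaling stays consistent with YaRN RoPE for DeepSeek-2 style models.
    static float deepseek2_kq_scale(float attn_factor_org, float rope_yarn_log_mul,
                                    float freq_scale, int n_embd_head_k) {
        const float mscale = attn_factor_org * (1.0f + 0.1f * rope_yarn_log_mul * logf(1.0f / freq_scale));
        return mscale * mscale / sqrtf((float) n_embd_head_k);
    }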