models : fix the attn_factor for mistral3 graphs + improve consistency (#17945)

* models : fix the attn_factor for mistral3 graphs
* cont : rework attn_factor correction logic
* cont : make deepseek2 consistent
* cont : add TODO
* cont : special-case DSv2
* cont : revert Mistral 3 Large changes
* cont : fix DS2 to use the original attn_factor
* cont : minor comments
2025-12-12 17:12:40 +02:00
parent dcb7d17758
commit 7bed317f53
7 changed files with 78 additions and 33 deletions
@@ -1,7 +1,9 @@
 #include "llama-hparams.h"

 #include "ggml.h"
+
 #include <cassert>
+#include <cmath>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
    if (dense_first) {
@@ -229,3 +231,13 @@ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama

    return false;
 }
+
+// Returns attn_factor, rescaled when ext_factor is non-zero:
+//
+//   attn_factor * 1 / (1 + 0.1 * ln(1 / freq_scale))
+//
+// When ext_factor == 0 the input attn_factor is returned unchanged.
+// ext_factor must be non-negative (checked with GGML_ASSERT).
+float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor) {
+    GGML_ASSERT(ext_factor >= 0.0f);
+
+    // nothing to adjust when the extrapolation factor is exactly zero
+    if (ext_factor == 0.0f) {
+        return attn_factor;
+    }
+
+    // keep the reciprocal-then-multiply order so the float result is unchanged
+    const float denom = 1.0f + 0.1f * logf(1.0f / freq_scale);
+
+    return attn_factor * (1.0f / denom);
+}