server : fix swa-full logic (#22288)
@@ -675,6 +675,10 @@ private:
 
     int32_t n_ctx; // total context for all clients / slots
 
+    // set to llama_model_n_swa(model)
+    // if swa_full is enabled, this is set to 0 to simulate a non-SWA model
+    int32_t n_swa;
+
     // slots / clients
     std::vector<server_slot> slots;
 
@@ -854,6 +858,8 @@ private:
         }
     }
 
+    n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
+
     // Necessary similarity of prompt for slot selection
     slot_prompt_similarity = params_base.slot_prompt_similarity;
 
@@ -2415,9 +2421,6 @@ private:
 
     llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
 
-    // note: when n_swa == 0, the model does not use SWA
-    const auto n_swa = std::max(0, llama_model_n_swa(model));
-
     // the largest pos_min required for a checkpoint to be useful
     const auto pos_min_thold = std::max(0, pos_next - n_swa);
 
@@ -2589,10 +2592,10 @@ private:
 
     // make a checkpoint of the parts of the memory that cannot be rolled back.
     // checkpoints are created only if:
     // - the model does not support partial sequence removal
-    // - the model uses SWA and we are not using `swa_full`
+    // - the model uses SWA (and we are not using `swa_full`)
     do_checkpoint = do_checkpoint && (
         (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
-        (llama_model_n_swa(model) > 0 && !params_base.swa_full));
+        (n_swa > 0));
 
     bool has_mtmd = false;
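
For context, a minimal standalone sketch of the behavior the hunks above establish, assuming only what the diff shows: n_swa is computed once at init time (forced to 0 when swa_full is enabled, simulating a non-SWA model) and the checkpoint decision then checks that cached value instead of re-querying the model. The struct and function names below are illustrative placeholders, not the actual server types.

    // Hedged sketch: placeholder types, not the real server_context.
    #include <cstdint>
    #include <cstdio>

    struct swa_state_sketch {
        int32_t n_swa = 0; // 0 means "behave like a non-SWA model"

        // mirrors: n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
        void init(int32_t model_n_swa, bool swa_full) {
            n_swa = swa_full ? 0 : model_n_swa;
        }

        // mirrors the simplified condition: a checkpoint is kept only if the
        // context requires full sequence removal, or the model uses SWA and
        // swa_full is off (n_swa > 0)
        bool want_checkpoint(bool do_checkpoint, bool full_seq_rm_only) const {
            return do_checkpoint && (full_seq_rm_only || n_swa > 0);
        }
    };

    int main() {
        swa_state_sketch s;
        s.init(/*model_n_swa=*/2048, /*swa_full=*/true);
        // with swa_full enabled the SWA branch is disabled, so no checkpoint here
        std::printf("n_swa = %d, checkpoint = %d\n",
                    s.n_swa, (int) s.want_checkpoint(true, false));
        return 0;
    }

The design point, as far as the diff shows, is to compute the SWA width once and treat swa_full as equivalent to a non-SWA model everywhere downstream, so later call sites only need to test n_swa.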