server : fix swa-full logic (#22288)

This commit is contained in:
Georgi Gerganov
2026-04-24 10:17:37 +03:00
committed by GitHub
parent 793d0a7931
commit ffdd983fb8
+8 -5
View File
@@ -675,6 +675,10 @@ private:
int32_t n_ctx; // total context for all clients / slots
// normally set to llama_model_n_swa(model);
// when swa_full is enabled, this is set to 0 instead, to simulate a non-SWA model
int32_t n_swa;
// slots / clients
std::vector<server_slot> slots;
@@ -854,6 +858,8 @@ private:
}
}
n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;
@@ -2415,9 +2421,6 @@ private:
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
// note: when n_swa == 0, the model does not use SWA
const auto n_swa = std::max(0, llama_model_n_swa(model));
// the largest pos_min required for a checkpoint to be useful
const auto pos_min_thold = std::max(0, pos_next - n_swa);
@@ -2589,10 +2592,10 @@ private:
// make a checkpoint of the parts of the memory that cannot be rolled back.
// checkpoints are created only if:
// - the model does not support partial sequence removal
// - the model uses SWA and we are not using `swa_full`
// - the model uses SWA (and we are not using `swa_full`)
do_checkpoint = do_checkpoint && (
(slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
(llama_model_n_swa(model) > 0 && !params_base.swa_full));
(n_swa > 0));
bool has_mtmd = false;