diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ef2a4fddc..08ff1e362 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -675,6 +675,10 @@ private:
 
     int32_t n_ctx; // total context for all clients / slots
 
+    // set to llama_model_n_swa(model)
+    // if swa_full is enabled, this is set to 0 to simulate a non-SWA model
+    int32_t n_swa;
+
     // slots / clients
     std::vector<server_slot> slots;
 
@@ -854,6 +858,8 @@ private:
             }
         }
 
+        n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
+
         // Necessary similarity of prompt for slot selection
         slot_prompt_similarity = params_base.slot_prompt_similarity;
 
@@ -2415,9 +2421,6 @@ private:
 
         llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
 
-        // note: when n_swa == 0, the model does not use SWA
-        const auto n_swa = std::max(0, llama_model_n_swa(model));
-
         // the largest pos_min required for a checkpoint to be useful
         const auto pos_min_thold = std::max(0, pos_next - n_swa);
 
@@ -2589,10 +2592,10 @@ private:
         // make a checkpoint of the parts of the memory that cannot be rolled back.
         // checkpoints are created only if:
         // - the model does not support partial sequence removal
-        // - the model uses SWA and we are not using `swa_full`
+        // - the model uses SWA (and we are not using `swa_full`)
         do_checkpoint = do_checkpoint && (
                 (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
-                (llama_model_n_swa(model) > 0 && !params_base.swa_full));
+                (n_swa > 0));
 
         bool has_mtmd = false;
 