server : fix swa-full logic (#22288)
@@ -675,6 +675,10 @@ private:
 
     int32_t n_ctx; // total context for all clients / slots
 
+    // set to llama_model_n_swa(model)
+    // if swa_full is enabled, this is set to 0 to simulate a non-SWA model
+    int32_t n_swa;
+
     // slots / clients
     std::vector<server_slot> slots;
 
@@ -854,6 +858,8 @@ private:
         }
     }
 
+    n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
+
     // Necessary similarity of prompt for slot selection
     slot_prompt_similarity = params_base.slot_prompt_similarity;
 
@@ -2415,9 +2421,6 @@ private:
 
     llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
 
-    // note: when n_swa == 0, the model does not use SWA
-    const auto n_swa = std::max(0, llama_model_n_swa(model));
-
     // the largest pos_min required for a checkpoint to be useful
     const auto pos_min_thold = std::max(0, pos_next - n_swa);
 
@@ -2589,10 +2592,10 @@ private:
 
     // make a checkpoint of the parts of the memory that cannot be rolled back.
     // checkpoints are created only if:
     // - the model does not support partial sequence removal
-    // - the model uses SWA and we are not using `swa_full`
+    // - the model uses SWA (and we are not using `swa_full`)
     do_checkpoint = do_checkpoint && (
         (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
-        (llama_model_n_swa(model) > 0 && !params_base.swa_full));
+        (n_swa > 0));
 
     bool has_mtmd = false;
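
For context, a minimal standalone sketch of the behavior the hunks above establish, assuming only what the diff shows: n_swa is computed once at init time (forced to 0 when swa_full is enabled, simulating a non-SWA model) and the checkpoint decision then checks that cached value instead of re-querying the model. The struct and function names below are illustrative placeholders, not the actual server types.

    // Hedged sketch: placeholder types, not the real server_context.
    #include <cstdint>
    #include <cstdio>

    struct swa_state_sketch {
        int32_t n_swa = 0; // 0 means "behave like a non-SWA model"

        // mirrors: n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
        void init(int32_t model_n_swa, bool swa_full) {
            n_swa = swa_full ? 0 : model_n_swa;
        }

        // mirrors the simplified condition: a checkpoint is kept only if the
        // context requires full sequence removal, or the model uses SWA and
        // swa_full is off (n_swa > 0)
        bool want_checkpoint(bool do_checkpoint, bool full_seq_rm_only) const {
            return do_checkpoint && (full_seq_rm_only || n_swa > 0);
        }
    };

    int main() {
        swa_state_sketch s;
        s.init(/*model_n_swa=*/2048, /*swa_full=*/true);
        // with swa_full enabled the SWA branch is disabled, so no checkpoint here
        std::printf("n_swa = %d, checkpoint = %d\n",
                    s.n_swa, (int) s.want_checkpoint(true, false));
        return 0;
    }

The design point, as far as the diff shows, is to compute the SWA width once and treat swa_full as equivalent to a non-SWA model everywhere downstream, so later call sites only need to test n_swa.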