llama : add adaptive-p sampler (#17927)
* initial commit for branch * simplify constants * add params to `struct common_params_sampling`, add reference to PR * explicitly clamp `min_target` and `max_target` to `[0.0, 1.0]` * add args, rename `queue_size` -> `window_size` * improved comments * minor * remove old unused code from algorithm * minor * add power law case to `common_sampler_init`, add sampler name mappings * clarify behaviour when `window_size = 0` * add missing enums * remove `target_range` param, make `target == 1` no-op, cleanup code * oops, straggler * add missing parameters in `server-task.cpp` * copy from author ref: https://gist.github.com/MrJackSpade/9be99c7efbba7b95a41377e123b7b069 * remove old debug log, style nit * fix compiler warning, add commented-out logging per token * re-write + change parameters + simplify * oops forgot args.cpp * fix leftover `window_size` * add missing values to `common_params_sampling::print()` * with logging * does this fix it? * no, but does this? * update default decay * optimize * fix bad merge my git skills are lacking * silence `missing initializer for member` * update default decay to 0.9 * fix logging * format (double) * add power law to the new `samplers` vector * log sampler init values * improve logging messages in llama_sampler_power_law * remove extraneous logging * simplify target computation last commit with debug logging! * remove debug logging, explicitly clamp params at init * add `use_power_law` flag + logic, minor cleanup * update `power-law` -> `adaptive-p` * fix cold start EMA - `ctx->weighted_sum` is now initialized and reset to `target / (1.0f - clamped_decay)` - `ctx->total_weight` is now initialized and reset to `1.0f / (1.0f - clamped_decay)` this fixes a "cold start" problem with the moving average * update `SHARPNESS` constant to `10.0f` * minor style fixes no functional changes * minor style fixes cont. * update `llama_sampler_adaptive_p_i` for backend sampling (ref: #17004) * separate into `apply` + `accept` functions * `pending_token_idx`: switch from `llama_token` to `int32` functionally identical (`llama.h` has `typedef int32_t llama_token;`), but its more correct now * don't transform logits <= -1e9f * fix masking in backend top-p, min-p * address review comments * typo in comments `RND` -> `RNG` * add docs * add recommended values in completion docs * address PR feedback * remove trailing whitespace (for CI `editorconfig`) * add to adaptive-p to `common_sampler_types_from_chars`
This commit is contained in:
+28
-25
@@ -119,6 +119,7 @@ enum common_sampler_type {
|
||||
COMMON_SAMPLER_TYPE_INFILL = 9,
|
||||
COMMON_SAMPLER_TYPE_PENALTIES = 10,
|
||||
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
|
||||
COMMON_SAMPLER_TYPE_ADAPTIVE_P = 12,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
@@ -166,32 +167,34 @@ enum common_params_sampling_config : uint64_t {
|
||||
struct common_params_sampling {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
||||
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
||||
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
||||
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float top_n_sigma = -1.00f;// -1.0 = disabled
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float xtc_probability = 0.00f; // 0.0 = disabled
|
||||
float xtc_threshold = 0.10f; // > 0.5 disables XTC
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
float dry_multiplier = 0.0f; // 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
|
||||
float dry_base = 1.75f; // 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
|
||||
int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
|
||||
int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
|
||||
float adaptive_target = -1.0f; // select tokens near this probability (valid range 0.0 to 1.0; negative = disabled)
|
||||
float adaptive_decay = 0.90f; // EMA decay for adaptation; history ≈ 1/(1-decay) tokens (0.0 - 0.99)
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float top_n_sigma = -1.00f; // -1.0 = disabled
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool timing_per_token = false;
|
||||
|
||||
uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
|
||||
|
||||
Reference in New Issue
Block a user