diff --git a/common/arg.cpp b/common/arg.cpp index 85d84e5cc..943d0766f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -97,8 +97,13 @@ common_arg & common_arg::set_env(const char * env) { return *this; } -common_arg & common_arg::set_sparam() { - is_sparam = true; +common_arg & common_arg::set_sampling() { + is_sampling = true; + return *this; +} + +common_arg & common_arg::set_spec() { + is_spec = true; return *this; } @@ -568,8 +573,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams); - postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams); - postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch); + postprocess_cpu_params(params.speculative.draft.cpuparams, &params.cpuparams); + postprocess_cpu_params(params.speculative.draft.cpuparams_batch, &params.cpuparams_batch); if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); @@ -591,8 +596,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context break; } } - common_params_handle_model(params.speculative.mparams_dft, params.hf_token, params.offline); - common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); + common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline); + common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); } // model is required (except for server) @@ -611,7 +616,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context for (auto & seq_breaker : params.sampling.dry_sequence_breakers) { string_process_escapes(seq_breaker); } - for (auto & pair : params.speculative.replacements) { + for (auto & pair : params.speculative.draft.replacements) { string_process_escapes(pair.first); string_process_escapes(pair.second); } @@ -628,8 +633,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.tensor_buft_overrides.push_back({nullptr, nullptr}); } - if (!params.speculative.tensor_buft_overrides.empty()) { - params.speculative.tensor_buft_overrides.push_back({nullptr, nullptr}); + if (!params.speculative.draft.tensor_buft_overrides.empty()) { + params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); } if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) { @@ -651,12 +656,15 @@ static void common_params_print_usage(common_params_context & ctx_arg) { }; std::vector<common_arg *> common_options; - std::vector<common_arg *> sparam_options; + std::vector<common_arg *> sampling_options; + std::vector<common_arg *> spec_options; std::vector<common_arg *> specific_options; for (auto & opt : ctx_arg.options) { // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example - if (opt.is_sparam) { - sparam_options.push_back(&opt); + if (opt.is_sampling) { + sampling_options.push_back(&opt); + } else if (opt.is_spec) { + spec_options.push_back(&opt); } else if (opt.in_example(ctx_arg.ex)) { specific_options.push_back(&opt); } else { @@ -666,7 +674,9 @@ static void common_params_print_usage(common_params_context & ctx_arg) { printf("----- common params -----\n\n"); print_options(common_options); printf("\n\n----- sampling params -----\n\n"); - print_options(sparam_options); + print_options(sampling_options); +
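The renamed set_sampling()/set_spec() tags are what common_params_print_usage uses above to bucket options into the usage sections. A minimal, self-contained sketch of that tag-and-partition pattern, using a hypothetical opt_t stand-in rather than the real common_arg:

#include <cstdio>
#include <string>
#include <vector>

struct opt_t { // hypothetical stand-in for common_arg
    std::string name;
    bool is_sampling = false;
    bool is_spec     = false;

    // chainable setters, mirroring common_arg::set_sampling() / set_spec()
    opt_t & set_sampling() { is_sampling = true; return *this; }
    opt_t & set_spec()     { is_spec     = true; return *this; }
};

int main() {
    std::vector<opt_t> options = {
        opt_t{"--ctx-size"},                    // untagged -> common
        opt_t{"--top-k"}.set_sampling(),        // tagged   -> sampling
        opt_t{"--spec-draft-n-max"}.set_spec(), // tagged   -> speculative
    };

    std::vector<opt_t *> common, sampling, spec;
    for (auto & opt : options) { // same precedence as the printer: sampling, then spec, then common
        if (opt.is_sampling) {
            sampling.push_back(&opt);
        } else if (opt.is_spec) {
            spec.push_back(&opt);
        } else {
            common.push_back(&opt);
        }
    }

    printf("common: %zu, sampling: %zu, speculative: %zu\n", common.size(), sampling.size(), spec.size());
}

Because each setter returns *this, tagging composes with the other builders, as in the .set_sampling().set_env("LLAMA_ARG_TOP_K") and .set_spec().set_examples(...) chains throughout the registrations below.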
printf("\n\n----- speculative params -----\n\n"); + print_options(spec_options); // TODO: maybe convert enum llama_example to string printf("\n\n----- example-specific params -----\n\n"); print_options(specific_options); @@ -674,12 +684,15 @@ static void common_params_print_usage(common_params_context & ctx_arg) { static void common_params_print_completion(common_params_context & ctx_arg) { std::vector common_options; - std::vector sparam_options; + std::vector sampling_options; + std::vector spec_options; std::vector specific_options; for (auto & opt : ctx_arg.options) { - if (opt.is_sparam) { - sparam_options.push_back(&opt); + if (opt.is_sampling) { + sampling_options.push_back(&opt); + } else if (opt.is_spec) { + spec_options.push_back(&opt); } else if (opt.in_example(ctx_arg.ex)) { specific_options.push_back(&opt); } else { @@ -703,7 +716,8 @@ static void common_params_print_completion(common_params_context & ctx_arg) { }; print_options(common_options); - print_options(sparam_options); + print_options(sampling_options); + print_options(spec_options); print_options(specific_options); printf("\"\n\n"); @@ -1223,14 +1237,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", [](common_params & params, const std::string & value) { - params.speculative.lookup_cache_static = value; + params.speculative.ngram_cache.lookup_cache_static = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", [](common_params & params, const std::string & value) { - params.speculative.lookup_cache_dynamic = value; + params.speculative.ngram_cache.lookup_cache_dynamic = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -1576,28 +1590,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"-s", "--seed"}, "SEED", string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), [](common_params & params, const std::string & value) { params.sampling.seed = std::stoul(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--sampler-seq", "--sampling-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { params.sampling.samplers = common_sampler_types_from_chars(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", [](common_params & params) { params.sampling.ignore_eos = true; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--temp", "--temperature"}, "N", string_format("temperature (default: %.2f)", (double)params.sampling.temp), @@ -1606,7 +1620,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.temp = std::max(params.sampling.temp, 0.0f); params.sampling.user_sampling_config |= 
common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--top-k"}, "N", string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), @@ -1614,7 +1628,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.top_k = value; params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K; } - ).set_sparam().set_env("LLAMA_ARG_TOP_K")); + ).set_sampling().set_env("LLAMA_ARG_TOP_K")); add_opt(common_arg( {"--top-p"}, "N", string_format("top-p sampling (default: %.2f, 1.0 = disabled)", (double)params.sampling.top_p), @@ -1622,7 +1636,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.top_p = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--min-p"}, "N", string_format("min-p sampling (default: %.2f, 0.0 = disabled)", (double)params.sampling.min_p), @@ -1630,14 +1644,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.min_p = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--top-nsigma", "--top-n-sigma"}, "N", string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma), [](common_params & params, const std::string & value) { params.sampling.top_n_sigma = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--xtc-probability"}, "N", string_format("xtc probability (default: %.2f, 0.0 = disabled)", (double)params.sampling.xtc_probability), @@ -1645,7 +1659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.xtc_probability = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--xtc-threshold"}, "N", string_format("xtc threshold (default: %.2f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), @@ -1653,14 +1667,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.xtc_threshold = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--typical", "--typical-p"}, "N", string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p), [](common_params & params, const std::string & value) { params.sampling.typ_p = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--repeat-last-n"}, "N", string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), @@ -1672,7 +1686,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--repeat-penalty"}, 
"N", string_format("penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), @@ -1680,28 +1694,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.penalty_repeat = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--presence-penalty"}, "N", string_format("repeat alpha presence penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_present), [](common_params & params, const std::string & value) { params.sampling.penalty_present = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--frequency-penalty"}, "N", string_format("repeat alpha frequency penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_freq), [](common_params & params, const std::string & value) { params.sampling.penalty_freq = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--dry-multiplier"}, "N", string_format("set DRY sampling multiplier (default: %.2f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), [](common_params & params, const std::string & value) { params.sampling.dry_multiplier = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--dry-base"}, "N", string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), @@ -1712,14 +1726,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.dry_base = potential_base; } } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--dry-allowed-length"}, "N", string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), [](common_params & params, int value) { params.sampling.dry_allowed_length = value; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--dry-penalty-last-n"}, "N", string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), @@ -1729,7 +1743,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } params.sampling.dry_penalty_last_n = value; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--dry-sequence-breaker"}, "STRING", string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n", @@ -1755,7 +1769,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.dry_sequence_breakers.emplace_back(value); } } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--adaptive-target"}, "N", string_format("adaptive-p: select tokens near this probability (valid range 0.0 " @@ -1765,7 +1779,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.sampling.adaptive_target = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--adaptive-decay"}, "N", string_format("adaptive-p: decay rate for target adaptation over time. 
lower values " @@ -1775,21 +1789,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.sampling.adaptive_decay = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--dynatemp-range"}, "N", string_format("dynamic temperature range (default: %.2f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), [](common_params & params, const std::string & value) { params.sampling.dynatemp_range = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--dynatemp-exp"}, "N", string_format("dynamic temperature exponent (default: %.2f)", (double)params.sampling.dynatemp_exponent), [](common_params & params, const std::string & value) { params.sampling.dynatemp_exponent = std::stof(value); } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--mirostat"}, "N", string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n" @@ -1798,7 +1812,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.mirostat = value; params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--mirostat-lr"}, "N", string_format("Mirostat learning rate, parameter eta (default: %.2f)", (double)params.sampling.mirostat_eta), @@ -1806,7 +1820,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.mirostat_eta = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--mirostat-ent"}, "N", string_format("Mirostat target entropy, parameter tau (default: %.2f)", (double)params.sampling.mirostat_tau), @@ -1814,7 +1828,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.mirostat_tau = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" @@ -1836,28 +1850,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("invalid input format"); } } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--grammar"}, "GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir)", [](common_params & params, const std::string & value) { params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, value}; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", [](common_params & params, const std::string & value) { params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, read_file(value)}; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", [](common_params & params, const std::string & value) { params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, json_schema_to_grammar(json::parse(value))}; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"-jf", "--json-schema-file"}, "FILE", "File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", @@ -1874,14 +1888,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ); params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, json_schema_to_grammar(json::parse(schema))}; } - ).set_sparam()); + ).set_sampling()); add_opt(common_arg( {"-bs", "--backend-sampling"}, "enable backend sampling (experimental) (default: disabled)", [](common_params & params) { params.sampling.backend_sampling = true; } - ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING")); + ).set_sampling().set_env("LLAMA_ARG_BACKEND_SAMPLING")); add_opt(common_arg( {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", @@ -2283,12 +2297,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex parse_tensor_buffer_overrides(value, params.tensor_buft_overrides); } ).set_env("LLAMA_ARG_OVERRIDE_TENSOR")); - add_opt(common_arg( - {"-otd", "--override-tensor-draft"}, "=,...", - "override tensor buffer type for draft model", [](common_params & params, const std::string & value) { - parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"-cmoe", "--cpu-moe"}, "keep all Mixture of Experts (MoE) weights in the CPU", @@ -2311,27 +2319,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_env("LLAMA_ARG_N_CPU_MOE")); - add_opt(common_arg( - {"-cmoed", "--cpu-moe-draft"}, - "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", - [](common_params & params) { - params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); - add_opt(common_arg( - {"-ncmoed", "--n-cpu-moe-draft"}, "N", - "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model", - [](common_params & params, int value) { - if (value < 0) { - throw std::invalid_argument("invalid value"); - } - for (int i = 0; i < value; ++i) { - static std::list buft_overrides_draft; - buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i)); - params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT")); GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0 add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", @@ -2614,13 +2601,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.model.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); - add_opt(common_arg( - {"-hfd", "-hfrd", "--hf-repo-draft"}, "/[:quant]", - "Same 
as --hf-repo, but for the draft model (default: unused)", - [](common_params & params, const std::string & value) { - params.speculative.mparams_dft.hf_repo = value; - } - ).set_env("LLAMA_ARG_HFD_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", @@ -3330,170 +3310,235 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_LOG_TIMESTAMPS")); + // // speculative parameters + // + add_opt(common_arg( - {"-td", "--threads-draft"}, "N", + {"--spec-draft-hf", "-hfd", "-hfrd", "--hf-repo-draft"}, "/[:quant]", + "Same as --hf-repo, but for the draft model (default: unused)", + [](common_params & params, const std::string & value) { + params.speculative.draft.mparams.hf_repo = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_HF_REPO")); + add_opt(common_arg( + {"--spec-draft-threads", "-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", [](common_params & params, int value) { - params.speculative.cpuparams.n_threads = value; - if (params.speculative.cpuparams.n_threads <= 0) { - params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + params.speculative.draft.cpuparams.n_threads = value; + if (params.speculative.draft.cpuparams.n_threads <= 0) { + params.speculative.draft.cpuparams.n_threads = std::thread::hardware_concurrency(); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-tbd", "--threads-batch-draft"}, "N", + {"--spec-draft-threads-batch", "-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", [](common_params & params, int value) { - params.speculative.cpuparams_batch.n_threads = value; - if (params.speculative.cpuparams_batch.n_threads <= 0) { - params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + params.speculative.draft.cpuparams_batch.n_threads = value; + if (params.speculative.draft.cpuparams_batch.n_threads <= 0) { + params.speculative.draft.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-Cd", "--cpu-mask-draft"}, "M", + {"--spec-draft-cpu-mask", "-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", [](common_params & params, const std::string & mask) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + params.speculative.draft.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.draft.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", + {"--spec-draft-cpu-range", "-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", [](common_params & params, const std::string & range) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + params.speculative.draft.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.draft.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--cpu-strict-draft"}, "<0|1>", + {"--spec-draft-cpu-strict", "--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", [](common_params & params, int value) { - params.speculative.cpuparams.strict_cpu = value; + params.speculative.draft.cpuparams.strict_cpu = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--prio-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), + {"--spec-draft-prio", "--prio-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.draft.cpuparams.priority), [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + params.speculative.draft.cpuparams.priority = (enum ggml_sched_priority) prio; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--poll-draft"}, "<0|1>", + {"--spec-draft-poll", "--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", [](common_params & params, int value) { - params.speculative.cpuparams.poll = value; + params.speculative.draft.cpuparams.poll = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", + {"--spec-draft-cpu-mask-batch", "-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", [](common_params & params, const std::string & mask) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + params.speculative.draft.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.speculative.draft.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", + {"--spec-draft-cpu-range-batch", "-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", [](common_params & params, const std::string & range) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + params.speculative.draft.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.speculative.draft.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", + {"--spec-draft-cpu-strict-batch", "--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", [](common_params & params, int value) { - params.speculative.cpuparams_batch.strict_cpu = value; + params.speculative.draft.cpuparams_batch.strict_cpu = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--prio-batch-draft"}, "N", - string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), + {"--spec-draft-prio-batch", "--prio-batch-draft"}, "N", + string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.draft.cpuparams_batch.priority), [](common_params & params, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + params.speculative.draft.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--poll-batch-draft"}, "<0|1>", + {"--spec-draft-poll-batch", "--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", [](common_params & params, int value) { - params.speculative.cpuparams_batch.poll = value; + params.speculative.draft.cpuparams_batch.poll = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--draft", "--draft-n", "--draft-max"}, "N", - string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), - [](common_params & params, int value) { - params.speculative.n_max = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX")); - add_opt(common_arg( - {"--draft-min", "--draft-n-min"}, "N", - string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), - [](common_params & params, int value) { - params.speculative.n_min = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN")); - add_opt(common_arg( - {"--draft-p-split"}, "P", - string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.p_split), + {"--spec-draft-type-k", "-ctkd", "--cache-type-k-draft"}, "TYPE", + string_format( + "KV cache data type for K for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + 
get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.draft.cache_type_k) + ), [](common_params & params, const std::string & value) { - params.speculative.p_split = std::stof(value); + params.speculative.draft.cache_type_k = kv_cache_type_from_str(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); + ).set_env("LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K")); add_opt(common_arg( - {"--draft-p-min"}, "P", - string_format("minimum speculative decoding probability (greedy) (default: %.2f)", (double)params.speculative.p_min), + {"--spec-draft-type-v", "-ctvd", "--cache-type-v-draft"}, "TYPE", + string_format( + "KV cache data type for V for the draft model\n" + "allowed values: %s\n" + "(default: %s)", + get_all_kv_cache_types().c_str(), + ggml_type_name(params.speculative.draft.cache_type_v) + ), [](common_params & params, const std::string & value) { - params.speculative.p_min = std::stof(value); + params.speculative.draft.cache_type_v = kv_cache_type_from_str(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN")); + ).set_env("LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V")); add_opt(common_arg( - {"-cd", "--ctx-size-draft"}, "N", - string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), + {"--spec-draft-override-tensor", "-otd", "--override-tensor-draft"}, "=,...", + "override tensor buffer type for draft model", [](common_params & params, const std::string & value) { + parse_tensor_buffer_overrides(value, params.speculative.draft.tensor_buft_overrides); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + {"--spec-draft-cpu-moe", "-cmoed", "--cpu-moe-draft"}, + "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", + [](common_params & params) { + params.speculative.draft.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CPU_MOE")); + add_opt(common_arg( + {"--spec-draft-n-cpu-moe", "--spec-draft-ncmoe", "-ncmoed", "--n-cpu-moe-draft"}, "N", + "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model", [](common_params & params, int value) { - params.speculative.n_ctx = value; + if (value < 0) { + throw std::invalid_argument("invalid value"); + } + for (int i = 0; i < value; ++i) { + static std::list buft_overrides_draft; + buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i)); + params.speculative.draft.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); + } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE")); + add_opt(common_arg( - {"-devd", "--device-draft"}, "", + {"--spec-draft-n-max"}, "N", + string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.draft.n_max), + [](common_params & params, int value) { + params.speculative.draft.n_max = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MAX")); + 
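One detail worth noting in the --spec-draft-n-cpu-moe handler above: the generated per-layer regex strings are kept in a function-local static std::list of strings so that the c_str() pointers stored in the tensor-buft override entries remain valid after the handler returns; unlike std::vector, std::list never relocates existing elements when it grows. A standalone sketch of that lifetime guarantee (illustrative only, not llama.cpp code):

#include <cstdio>
#include <list>
#include <string>

int main() {
    std::list<std::string> patterns; // node-based: push_back never moves existing elements
    patterns.push_back("blk\\.0\\.ffn_.*_exps");

    const char * p = patterns.back().c_str(); // pointer taken before further growth

    for (int i = 1; i < 1000; ++i) {
        patterns.push_back("blk\\." + std::to_string(i) + "\\.ffn_.*_exps");
    }

    printf("still valid: %s\n", p); // ok: list nodes are stable in memory

    // with std::vector<std::string> the push_back calls could reallocate and
    // move the strings, leaving a previously taken c_str() pointer dangling
}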
add_opt(common_arg( + {"--spec-draft-n-min"}, "N", + string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.draft.n_min), + [](common_params & params, int value) { + params.speculative.draft.n_min = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_N_MIN")); + + add_opt(common_arg( + {"--spec--draft-p-split", "--draft-p-split"}, "P", + string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.draft.p_split), + [](common_params & params, const std::string & value) { + params.speculative.draft.p_split = std::stof(value); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_SPLIT")); + add_opt(common_arg( + {"--spec-draft-p-min", "--draft-p-min"}, "P", + string_format("minimum speculative decoding probability (greedy) (default: %.2f)", (double)params.speculative.draft.p_min), + [](common_params & params, const std::string & value) { + params.speculative.draft.p_min = std::stof(value); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN")); + add_opt(common_arg( + {"--spec-draft-ctx-size", "-cd", "--ctx-size-draft"}, "N", + string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.draft.n_ctx), + [](common_params & params, int value) { + params.speculative.draft.n_ctx = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CTX_SIZE")); + add_opt(common_arg( + {"--spec-draft-device", "-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" "use --list-devices to see a list of available devices", [](common_params & params, const std::string & value) { - params.speculative.devices = parse_device_list(value); + params.speculative.draft.devices = parse_device_list(value); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); - GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0 + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + GGML_ASSERT(params.speculative.draft.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0 add_opt(common_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", + {"--spec-draft-ngl", "-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", - params.speculative.n_gpu_layers == -1 ? "auto" : "all"), + params.speculative.draft.n_gpu_layers == -1 ? 
"auto" : "all"), [](common_params & params, const std::string & value) { if (value == "auto") { - params.speculative.n_gpu_layers = -1; + params.speculative.draft.n_gpu_layers = -1; } else if (value == "all") { - params.speculative.n_gpu_layers = -2; + params.speculative.draft.n_gpu_layers = -2; } else { - params.speculative.n_gpu_layers = std::stoi(value); + params.speculative.draft.n_gpu_layers = std::stoi(value); } if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); @@ -3501,21 +3546,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT")); add_opt(common_arg( - {"-md", "--model-draft"}, "FNAME", + {"--spec-draft-model", "-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { - params.speculative.mparams_dft.path = value; + params.speculative.draft.mparams.path = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT")); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL")); add_opt(common_arg( - {"--spec-replace"}, "TARGET", "DRAFT", + {"--spec-draft-replace", "--spec-replace"}, "TARGET", "DRAFT", "translate the string in TARGET into DRAFT if the draft model and main model are not compatible", [](common_params & params, const std::string & tgt, const std::string & dft) { - params.speculative.replacements.push_back({ tgt, dft }); + params.speculative.draft.replacements.push_back({ tgt, dft }); } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]", string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n", @@ -3537,63 +3582,174 @@ common_params_context common_params_parser_init(common_params & params, llama_ex throw std::invalid_argument("unknown speculative decoding type without draft model"); } } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SPEC_TYPE")); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE")); add_opt(common_arg( - {"--spec-ngram-size-n"}, "N", - string_format("ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_size_n), + {"--spec-ngram-mod-n-min"}, "N", + string_format("minimum number of ngram tokens to use for ngram-based speculative decoding (default: %d)", params.speculative.ngram_mod.n_min), + [](common_params & params, int value) { + if (value < 0 || value > 1024) { + throw std::invalid_argument("ngram n-min must be between 0 and 1024 inclusive"); + } + params.speculative.ngram_mod.n_min = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + 
{"--spec-ngram-mod-n-max"}, "N", + string_format("maximum number of ngram tokens to use for ngram-based speculative decoding (default: %d)", params.speculative.ngram_mod.n_max), + [](common_params & params, int value) { + if (value < 0 || value > 1024) { + throw std::invalid_argument("ngram n-max must be between 0 and 1024 inclusive"); + } + params.speculative.ngram_mod.n_max = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + {"--spec-ngram-mod-n-match"}, "N", + string_format("ngram-mod lookup length (default: %d)", params.speculative.ngram_mod.n_match), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive"); } - params.speculative.ngram_size_n = value; + params.speculative.ngram_mod.n_match = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( - {"--spec-ngram-size-m"}, "N", - string_format("ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_size_m), + {"--spec-ngram-simple-size-n"}, "N", + string_format("ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_simple.size_n), + [](common_params & params, int value) { + if (value < 1 || value > 1024) { + throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive"); + } + params.speculative.ngram_simple.size_n = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + {"--spec-ngram-simple-size-m"}, "N", + string_format("ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_simple.size_m), [](common_params & params, int value) { if (value < 1 || value > 1024) { throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive"); } - params.speculative.ngram_size_m = value; + params.speculative.ngram_simple.size_m = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"--spec-ngram-min-hits"}, "N", - string_format("minimum hits for ngram-map speculative decoding (default: %d)", params.speculative.ngram_min_hits), + {"--spec-ngram-simple-min-hits"}, "N", + string_format("minimum hits for ngram-simple speculative decoding (default: %d)", params.speculative.ngram_simple.min_hits), [](common_params & params, int value) { if (value < 1) { throw std::invalid_argument("ngram min hits must be at least 1"); } - params.speculative.ngram_min_hits = value; + params.speculative.ngram_simple.min_hits = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( - {"-ctkd", "--cache-type-k-draft"}, "TYPE", - string_format( - "KV cache data type for K for the draft model\n" - "allowed values: %s\n" - "(default: %s)", - get_all_kv_cache_types().c_str(), - ggml_type_name(params.speculative.cache_type_k) - ), - [](common_params & params, const std::string & value) { - params.speculative.cache_type_k = kv_cache_type_from_str(value); + {"--spec-ngram-map-k-size-n"}, "N", + string_format("ngram size N for ngram-map-k 
speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_map_k.size_n), + [](common_params & params, int value) { + if (value < 1 || value > 1024) { + throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive"); + } + params.speculative.ngram_map_k.size_n = value; } - ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT")); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( - {"-ctvd", "--cache-type-v-draft"}, "TYPE", - string_format( - "KV cache data type for V for the draft model\n" - "allowed values: %s\n" - "(default: %s)", - get_all_kv_cache_types().c_str(), - ggml_type_name(params.speculative.cache_type_v) - ), - [](common_params & params, const std::string & value) { - params.speculative.cache_type_v = kv_cache_type_from_str(value); + {"--spec-ngram-map-k-size-m"}, "N", + string_format("ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_map_k.size_m), + [](common_params & params, int value) { + if (value < 1 || value > 1024) { + throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive"); + } + params.speculative.ngram_map_k.size_m = value; } - ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT")); + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + {"--spec-ngram-map-k-min-hits"}, "N", + string_format("minimum hits for ngram-map-k speculative decoding (default: %d)", params.speculative.ngram_map_k.min_hits), + [](common_params & params, int value) { + if (value < 1) { + throw std::invalid_argument("ngram min hits must be at least 1"); + } + params.speculative.ngram_map_k.min_hits = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + + add_opt(common_arg( + {"--spec-ngram-map-k4v-size-n"}, "N", + string_format("ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: %d)", params.speculative.ngram_map_k4v.size_n), + [](common_params & params, int value) { + if (value < 1 || value > 1024) { + throw std::invalid_argument("ngram size N must be between 1 and 1024 inclusive"); + } + params.speculative.ngram_map_k4v.size_n = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + {"--spec-ngram-map-k4v-size-m"}, "N", + string_format("ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: %d)", params.speculative.ngram_map_k4v.size_m), + [](common_params & params, int value) { + if (value < 1 || value > 1024) { + throw std::invalid_argument("ngram size M must be between 1 and 1024 inclusive"); + } + params.speculative.ngram_map_k4v.size_m = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + add_opt(common_arg( + {"--spec-ngram-map-k4v-min-hits"}, "N", + string_format("minimum hits for ngram-map-k4v speculative decoding (default: %d)", params.speculative.ngram_map_k4v.min_hits), + [](common_params & params, int value) { + if (value < 1) { + throw std::invalid_argument("ngram min hits must be at least 1"); + } + params.speculative.ngram_map_k4v.min_hits = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); + + // + // removed params + // + + add_opt(common_arg( + {"--draft", "--draft-n", "--draft-max"}, "N", + "the argument has 
been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max", + [](common_params & /*params*/, int /*value*/) { + throw std::invalid_argument("the argument has been removed. use --spec-draft-n-max or --spec-ngram-mod-n-max"); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX")); + add_opt(common_arg( + {"--draft-min", "--draft-n-min"}, "N", + "the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min", + [](common_params & /*params*/, int /*value*/) { + throw std::invalid_argument("the argument has been removed. use --spec-draft-n-min or --spec-ngram-mod-n-min"); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN")); + add_opt(common_arg( + {"--spec-ngram-size-n"}, "N", + "the argument has been removed. use the respective --spec-ngram-*-size-n or --spec-ngram-mod-n-match", + [](common_params & /*params*/, int /*value*/) { + throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-size-n"); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--spec-ngram-size-m"}, "N", + "the argument has been removed. use the respective --spec-ngram-*-size-m", + [](common_params & /*params*/, int /*value*/) { + throw std::invalid_argument("the argument has been removed. use the respective --spec-ngram-*-size-m"); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--spec-ngram-min-hits"}, "N", + "the argument has been removed. use the respective --spec-ngram-*-min-hits", + [](common_params & /*params*/, int /*value*/) { + throw std::invalid_argument("the argument has been removed. 
use the respective --spec-ngram-*-min-hits"); + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SERVER})); + + // + // TTS params + // add_opt(common_arg( {"-mv", "--model-vocoder"}, "FNAME", @@ -3617,6 +3773,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_TTS})); + // + // diffusion params + // + add_opt(common_arg( {"--diffusion-steps"}, "N", string_format("number of diffusion steps (default: %d)", params.diffusion.steps), @@ -3801,8 +3961,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; - params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; - params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.draft.mparams.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.draft.mparams.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; params.port = 8012; params.n_ubatch = 1024; params.n_batch = 1024; @@ -3817,8 +3977,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; - params.speculative.mparams_dft.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; - params.speculative.mparams_dft.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.draft.mparams.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.draft.mparams.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; params.port = 8012; params.n_ubatch = 1024; params.n_batch = 1024; @@ -3907,9 +4067,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("enable default speculative decoding config"), [](common_params & params) { params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD; - params.speculative.ngram_size_n = 24; - params.speculative.n_min = 48; - params.speculative.n_max = 64; + params.speculative.ngram_mod.n_match = 24; + params.speculative.ngram_mod.n_min = 48; + params.speculative.ngram_mod.n_max = 64; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); diff --git a/common/arg.h b/common/arg.h index 55782a158..2c2a4e38a 100644 --- a/common/arg.h +++ b/common/arg.h @@ -25,7 +25,8 @@ struct common_arg { const char * value_hint_2 = nullptr; // for second arg value const char * env = nullptr; std::string help; - bool is_sparam = false; // is current arg a sampling param? + bool is_sampling = false; // is current arg a sampling param? + bool is_spec = false; // is current arg a speculative decoding param? 
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg) void (*handler_void) (common_params & params) = nullptr; void (*handler_string) (common_params & params, const std::string &) = nullptr; @@ -74,7 +75,8 @@ struct common_arg { common_arg & set_examples(std::initializer_list<enum llama_example> examples); common_arg & set_excludes(std::initializer_list<enum llama_example> excludes); common_arg & set_env(const char * env); - common_arg & set_sparam(); + common_arg & set_sampling(); + common_arg & set_spec(); common_arg & set_preset_only(); bool in_example(enum llama_example ex); bool is_exclude(enum llama_example ex); diff --git a/common/common.cpp b/common/common.cpp index f7f33e817..793b8fee7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -70,7 +70,7 @@ common_time_meas::~common_time_meas() { // CPU utils // -int32_t cpu_get_num_physical_cores() { +int32_t common_cpu_get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores std::unordered_set<std::string> siblings; @@ -185,11 +185,11 @@ static int cpu_count_math_cpus(int n_cpu) { /** * Returns number of CPUs on system that are useful for math. */ -int32_t cpu_get_num_math() { +int32_t common_cpu_get_num_math() { #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) int n_cpu = sysconf(_SC_NPROCESSORS_ONLN); if (n_cpu < 1) { - return cpu_get_num_physical_cores(); + return common_cpu_get_num_physical_cores(); } if (is_hybrid_cpu()) { cpu_set_t affinity; @@ -202,7 +202,7 @@ int32_t cpu_get_num_math() { } } #endif - return cpu_get_num_physical_cores(); + return common_cpu_get_num_physical_cores(); } // Helper for setting process priority @@ -263,7 +263,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { // -void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { +void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) { int32_t n_set = 0; if (cpuparams.n_threads < 0) { @@ -271,7 +271,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (role_model != nullptr) { cpuparams = *role_model; } else { - cpuparams.n_threads = cpu_get_num_math(); + cpuparams.n_threads = common_cpu_get_num_math(); } } @@ -1521,7 +1521,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & return cparams; } -struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) { struct ggml_threadpool_params tpp; ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults diff --git a/common/common.h b/common/common.h index d2d3c1061..a564b3b8c 100644 --- a/common/common.h +++ b/common/common.h @@ -54,7 +54,7 @@ struct common_control_vector_load_info; // CPU utils // -struct cpu_params { +struct common_cpu_params { int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU @@ -63,8 +63,8 @@ struct cpu_params { uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) }; -int32_t cpu_get_num_physical_cores(); -int32_t cpu_get_num_math(); +int32_t common_cpu_get_num_physical_cores(); +int32_t common_cpu_get_num_math(); // // Common params @@ -297,34 +297,19 @@ struct common_params_model { struct common_ngram_mod; -struct common_params_speculative { - common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding +// draft-model-based speculative decoding parameters +struct common_params_speculative_draft { + int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - // general-purpose speculative decoding parameters + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.75f; // minimum speculative decoding probability (greedy) - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.75f; // minimum speculative decoding probability (greedy) + common_params_model mparams; - // ngram-based speculative decoding + llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts - uint16_t ngram_size_n = 12; // ngram size for lookup - uint16_t ngram_size_m = 48; // mgram size for speculative tokens - uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed - - std::shared_ptr<common_ngram_mod> ngram_mod; - - std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT - std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT - - // draft-model speculative decoding - - struct common_params_model mparams_dft; - - llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts - - llama_context_params cparams_dft; // these are the parameters for the draft llama_context + llama_context_params cparams; // these are the parameters for the draft llama_context int32_t n_ctx = 0; // draft context size int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) @@ -332,25 +317,60 @@ struct common_params_speculative { ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V - struct cpu_params cpuparams; - struct cpu_params cpuparams_batch; + common_cpu_params cpuparams; + common_cpu_params cpuparams_batch; std::vector<ggml_backend_dev_t> devices; // devices to use for offloading std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements std::vector<llama_model_tensor_buft_override> tensor_buft_overrides; +}; + +struct common_params_speculative_ngram_mod { + int32_t n_match = 24; + + int32_t n_max = 64; + int32_t n_min = 48; + + // shared instance of the ngram container for all speculative decoding contexts + std::shared_ptr<common_ngram_mod> obj; +}; + +struct common_params_speculative_ngram_map { + uint16_t size_n = 12; // ngram size for lookup + uint16_t size_m = 48; // mgram size for speculative tokens + uint16_t min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed +}; + +struct common_params_speculative_ngram_cache { + std::string lookup_cache_static; // path of static
ngram cache file for lookup decoding + std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding +}; + +struct common_params_speculative { + // TODO: become a vector in order to support "chains of speculators" + common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; + + common_params_speculative_draft draft; + + common_params_speculative_ngram_mod ngram_mod; + common_params_speculative_ngram_map ngram_simple; + common_params_speculative_ngram_map ngram_map_k; + common_params_speculative_ngram_map ngram_map_k4v; + + common_params_speculative_ngram_cache ngram_cache; bool has_dft() const { - return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty(); + return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty(); } }; struct common_params_vocoder { struct common_params_model model; - std::string speaker_file = ""; // speaker file path // NOLINT + std::string speaker_file; // speaker file path - bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT + bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy }; struct common_params_diffusion { @@ -433,8 +453,8 @@ struct common_params { enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - struct cpu_params cpuparams; - struct cpu_params cpuparams_batch; + common_cpu_params cpuparams; + common_cpu_params cpuparams_batch; ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; @@ -678,7 +698,7 @@ std::string common_params_get_system_info(const common_params & params); bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]); -void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr); +void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr); bool set_process_priority(enum ggml_sched_priority prio); // @@ -846,7 +866,7 @@ common_init_result_ptr common_init_from_params(common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); -struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params); // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); diff --git a/common/preset.cpp b/common/preset.cpp index 57ccd000b..9187a67f0 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -43,7 +43,7 @@ static std::set get_remote_preset_whitelist(const std::mapbatch; @@ -309,7 +314,7 @@ struct common_speculative_state_draft : public common_speculative_state { int reuse_i = 0; // index of part to be reused in prompt_dft int reuse_n = 0; // length of part to be reused in prompt_dft - const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max; + const int n_ctx = llama_n_ctx(ctx_dft) - sparams.n_max; llama_tokens prompt_cnv; if (!spec->vocab_cmpt) { @@ -367,7 +372,7 @@ struct common_speculative_state_draft : public common_speculative_state { } result.clear(); - result.reserve(params.n_max); + result.reserve(sparams.n_max); bool needs_ckpt = use_ckpt && prompt_dft.size() > 0; if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) { @@ -380,7 
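For downstream code, the common.h restructuring is a mechanical path change: every speculative field now lives in a sub-struct named after the speculator it configures. A short orientation sketch based on the struct declarations above (the mapping comments are editorial, not part of the patch):

    // params.speculative.n_max               -> params.speculative.draft.n_max
    // params.speculative.mparams_dft         -> params.speculative.draft.mparams
    // params.speculative.ngram_size_n        -> params.speculative.ngram_simple.size_n (likewise ngram_map_k, ngram_map_k4v)
    // params.speculative.lookup_cache_static -> params.speculative.ngram_cache.lookup_cache_static

    // e.g. the check that the updated has_dft() performs:
    const auto & d = params.speculative.draft;
    const bool has_draft_model = !d.mparams.path.empty() || !d.mparams.hf_repo.empty();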
diff --git a/common/preset.cpp b/common/preset.cpp
index 57ccd000b..9187a67f0 100644
--- a/common/preset.cpp
+++ b/common/preset.cpp
@@ -43,7 +43,7 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map
[...]

diff --git a/common/speculative.cpp b/common/speculative.cpp
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
[...]
@@ -309,7 +314,7 @@ struct common_speculative_state_draft : public common_speculative_state {
         int reuse_i = 0; // index of part to be reused in prompt_dft
         int reuse_n = 0; // length of part to be reused in prompt_dft

-        const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
+        const int n_ctx = llama_n_ctx(ctx_dft) - sparams.n_max;

         llama_tokens prompt_cnv;
         if (!spec->vocab_cmpt) {
@@ -367,7 +372,7 @@ struct common_speculative_state_draft : public common_speculative_state {
         }

         result.clear();
-        result.reserve(params.n_max);
+        result.reserve(sparams.n_max);

         bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
         if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
@@ -380,7 +385,7 @@ struct common_speculative_state_draft : public common_speculative_state {
             for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
                 result.push_back(prompt_dft[i]);

-                if (params.n_max <= (int) result.size()) {
+                if (sparams.n_max <= (int) result.size()) {
                     break;
                 }
             }
@@ -473,7 +478,7 @@ struct common_speculative_state_draft : public common_speculative_state {
         common_sampler_reset(smpl);

         // sample n_draft tokens from the draft model
-        for (int i = 0; i < params.n_max; ++i) {
+        for (int i = 0; i < sparams.n_max; ++i) {
             common_batch_clear(batch);

             common_sampler_sample(smpl, ctx_dft, 0, true);
@@ -492,12 +497,12 @@ struct common_speculative_state_draft : public common_speculative_state {

             result.push_back(id);

-            if (params.n_max <= (int) result.size()) {
+            if (sparams.n_max <= (int) result.size()) {
                 break;
             }

             // only collect very high-confidence draft tokens
-            if (cur_p->data[0].p < params.p_min) {
+            if (cur_p->data[0].p < sparams.p_min) {
                 break;
             }
@@ -518,10 +523,14 @@ struct common_speculative_state_draft : public common_speculative_state {
             detokenized = replace_to_tgt(detokenized);
             LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
             result = common_tokenize(ctx_tgt, detokenized, false, true);
-            if (result.size() > (size_t)params.n_max) {
-                result.resize(params.n_max);
+            if (result.size() > (size_t) sparams.n_max) {
+                result.resize(sparams.n_max);
             }
         }
+
+        if (result.size() < (size_t) sparams.n_min) {
+            result.clear();
+        }
     }

     void accept(uint16_t n_accepted) override {
@@ -529,6 +538,14 @@ struct common_speculative_state_draft : public common_speculative_state {
         GGML_UNUSED(n_accepted);
     }

+    int32_t n_max(const common_params_speculative & params) const override {
+        return params.draft.n_max;
+    }
+
+    int32_t n_min(const common_params_speculative & params) const override {
+        return params.draft.n_min;
+    }
+
     std::string replace_to_dft(const std::string & input) const {
         std::string result = input;
@@ -581,6 +598,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
         // noop
         GGML_UNUSED(n_accepted);
     }
+
+    int32_t n_max(const common_params_speculative & params) const override {
+        return params.draft.n_max;
+    }
+
+    int32_t n_min(const common_params_speculative & params) const override {
+        return params.draft.n_min;
+    }
 };

 // state of self-speculation (simple implementation, not ngram-map)
@@ -610,19 +635,27 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
         // noop
         GGML_UNUSED(n_accepted);
     }
+
+    int32_t n_max(const common_params_speculative & /*params*/) const override {
+        return config.size_mgram;
+    }
+
+    int32_t n_min(const common_params_speculative & /*params*/) const override {
+        return config.size_mgram;
+    }
 };

 struct common_speculative_state_ngram_map_k : public common_speculative_state {
     // draft ngram map for speculative decoding without draft model
-    common_ngram_map map;
+    common_ngram_map config;

     common_speculative_state_ngram_map_k(
             enum common_speculative_type type,
-            common_ngram_map map)
-        : common_speculative_state(type), map(std::move(map)) {}
+            common_ngram_map config)
+        : common_speculative_state(type), config(std::move(config)) {}

     void begin(const llama_tokens & prompt) override {
-        common_ngram_map_begin(map, prompt);
+        common_ngram_map_begin(config, prompt);
     }

     void draft(
@@ -630,12 +663,20 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
             const llama_tokens & prompt_tgt,
             llama_token id_last,
             llama_tokens & result) override {
-        common_ngram_map_draft(map, prompt_tgt, id_last, result);
+        common_ngram_map_draft(config, prompt_tgt, id_last, result);

         GGML_UNUSED(params);
     }

     void accept(uint16_t n_accepted) override {
-        common_ngram_map_accept(map, n_accepted);
+        common_ngram_map_accept(config, n_accepted);
+    }
+
+    int32_t n_max(const common_params_speculative & /*params*/) const override {
+        return config.size_value;
+    }
+
+    int32_t n_min(const common_params_speculative & /*params*/) const override {
+        return config.size_value;
     }
 };
@@ -692,7 +733,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
             const llama_tokens & prompt_tgt,
             llama_token id_last,
             llama_tokens & result) override {
-        GGML_UNUSED(params);
+        const auto & sparams = params.ngram_mod;

         n_draft_last = 0;
@@ -712,16 +753,16 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
             i_last = cur_len - n;
         }

-        result.resize(n + params.n_max);
+        result.resize(n + sparams.n_max);
         for (size_t i = 0; i < n - 1; ++i) {
             result[i] = prompt_tgt[cur_len - n + 1 + i];
         }
         result[n - 1] = id_last;

-        for (int i = 0; i < params.n_max; ++i) {
+        for (int i = 0; i < sparams.n_max; ++i) {
             const llama_token token = mod.get(result.data() + i);
             if (token == common_ngram_mod::EMPTY) {
-                if (i < params.n_min) {
+                if (i < sparams.n_min) {
                     result.clear();
                     return;
                 }
@@ -764,6 +805,14 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
             }
         }
     }
+
+    int32_t n_max(const common_params_speculative & params) const override {
+        return params.ngram_mod.n_max;
+    }
+
+    int32_t n_min(const common_params_speculative & params) const override {
+        return params.ngram_mod.n_min;
+    }
 };

 struct common_speculative_state_ngram_cache : public common_speculative_state {
@@ -857,6 +906,14 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
         // TODO: noop
         GGML_UNUSED(n_accepted);
     }
+
+    int32_t n_max(const common_params_speculative & /*params*/) const override {
+        return n_draft;
+    }
+
+    int32_t n_min(const common_params_speculative & /*params*/) const override {
+        return 0;
+    }
 };

 struct common_speculative {
@@ -865,11 +922,13 @@ struct common_speculative {

     common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
 };

-static common_ngram_map get_common_ngram_map(const common_speculative_config & config) {
-    uint16_t size_key   = config.params.ngram_size_n;
-    uint16_t size_value = config.params.ngram_size_m;
-    bool     key_only   = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
-    uint16_t min_hits   = config.params.ngram_min_hits;
+static common_ngram_map get_common_ngram_map(
+        common_speculative_type type,
+        const common_params_speculative_ngram_map & config) {
+    uint16_t size_key   = config.size_n;
+    uint16_t size_value = config.size_m;
+    bool     key_only   = type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
+    uint16_t min_hits   = config.min_hits;

     return common_ngram_map(size_key, size_value, key_only, min_hits);
 }
@@ -927,8 +986,8 @@ common_speculative * common_speculative_init(
         common_params_speculative & params,
         llama_context * ctx_tgt) {
     llama_context * ctx_dft = nullptr;
-    if (params.model_dft) {
-        ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
+    if (params.draft.model) {
+        ctx_dft = llama_init_from_model(params.draft.model, params.draft.cparams);
         if (ctx_dft == nullptr) {
             LOG_ERR("%s", "failed to create draft context\n");
             return nullptr;
@@ -938,7 +997,7 @@
     // Compute the implementations to use based on the config and their order of preference
     std::vector<common_speculative_config> configs = {}; // list of speculative configs to try
     {
-        bool has_draft = !params.mparams_dft.path.empty();
+        bool has_draft = !params.draft.mparams.path.empty();
         bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3

         bool has_ngram_cache = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
@@ -961,16 +1020,17 @@ common_speculative * common_speculative_init(
             configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
         }
         if (has_ngram_mod) {
-            // shared instance for all speculative decoding contexts
-            if (!params.ngram_mod) {
-                params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
+            auto & sparams = params.ngram_mod;

-                LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
-                    params.ngram_size_n, params.ngram_mod->size(),
-                    (float)(params.ngram_mod->size_bytes())/1024/1024);
+            if (!sparams.obj) {
+                sparams.obj = std::make_shared<common_ngram_mod>(sparams.n_match, 4*1024*1024);

-                if (params.ngram_size_n < 16) {
-                    LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
+                LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__,
+                    sparams.n_match, sparams.obj->size(), (float)(sparams.obj->size_bytes())/1024/1024);
+
+                if (sparams.n_match < 16) {
+                    LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
+                            "see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, sparams.n_match);
                 }
             }
@@ -1000,7 +1060,7 @@ common_speculative * common_speculative_init(
                 impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
                     /* .ctx_tgt      = */ ctx_tgt,
                     /* .ctx_dft      = */ ctx_dft,
-                    /* .replacements = */ params.replacements,
+                    /* .replacements = */ params.draft.replacements,
                     /* .use_ckpt     = */ use_ckpt
                 ));
                 break;
@@ -1010,18 +1070,18 @@
                 break;
             }
             case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
-                common_ngram_map ngram_map = get_common_ngram_map(config);
+                common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple);

                 uint16_t ngram_size_key   = ngram_map.size_key;
                 uint16_t mgram_size_value = ngram_map.size_value;

                 auto config_simple = common_ngram_simple_config {
-                    /* .size_ngram = */ ngram_size_key,
-                    /* .size_mgram = */ mgram_size_value
+                    /* .size_ngram = */ ngram_size_key,
+                    /* .size_mgram = */ mgram_size_value
                 };

                 auto state = std::make_unique<common_speculative_state_ngram_simple>(
-                    /* .type  = */ config.type,
-                    /* .state = */ config_simple
+                    /* .type  = */ config.type,
+                    /* .state = */ config_simple
                 );
                 impls.push_back(std::move(state));
                 break;
@@ -1030,18 +1090,17 @@
             case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
                 impls.push_back(std::make_unique<common_speculative_state_ngram_map_k>(
                     (config.type),
-                    get_common_ngram_map(config)
+                    get_common_ngram_map(config.type, config.params.ngram_map_k)
                 ));
                 break;
             }
             case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
-                GGML_ASSERT(config.params.ngram_mod);
-                impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
+                GGML_ASSERT(config.params.ngram_mod.obj);
+                impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod.obj));
                 break;
             }
             case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
-                auto state = create_state_ngram_cache(
-                    params.lookup_cache_static, params.lookup_cache_dynamic, config);
+                auto state = create_state_ngram_cache(params.ngram_cache.lookup_cache_static, params.ngram_cache.lookup_cache_dynamic, config);
                 impls.push_back(std::make_unique<common_speculative_state_ngram_cache>(state));
                 break;
             }
@@ -1099,6 +1158,15 @@ llama_tokens common_speculative_draft(
             impl->n_call_draft++;
         }

+        {
+            const int n_min = impl->n_min(params);
+
+            if (!result.empty() && (int) result.size() < n_min) {
+                LOG_DBG("%s: ignoring small draft: %d < %d\n", __func__, (int) result.size(), n_min);
+                result.clear();
+            }
+        }
+
         if (!result.empty()) {
             LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
                 common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
@@ -1108,7 +1176,7 @@ llama_tokens common_speculative_draft(
             impl->n_gen_drafts++;
             impl->n_gen_tokens += result.size();

-            break; // We have a draft, so break out of the loop and return it.
+            break; // we have a draft, so break out of the loop and return it.
         }
     }
@@ -1136,6 +1204,32 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
     }
 }

+int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params) {
+    if (spec == nullptr) {
+        return 0;
+    }
+
+    int32_t n_max = 0;
+    for (const auto & impl : spec->impls) {
+        n_max = std::max(n_max, impl->n_max(params));
+    }
+
+    return n_max;
+}
+
+int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params) {
+    if (spec == nullptr) {
+        return 0;
+    }
+
+    int32_t n_min = 0;
+    for (const auto & impl : spec->impls) {
+        n_min = std::max(n_min, impl->n_min(params));
+    }
+
+    return n_min;
+}
+
 void common_speculative_print_stats(const common_speculative * spec) {
     if (spec == nullptr) {
         return;
diff --git a/common/speculative.h b/common/speculative.h
index bca78d32b..147447631 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -33,6 +33,9 @@ llama_tokens common_speculative_draft(
 // informs the speculative decoder that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);

+int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
+int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
+
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
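Because a common_speculative can hold several candidate implementations, the two new accessors reduce over all of them with std::max, and each implementation reports its own limits via the n_max()/n_min() overrides added above. A sketch of the intended call pattern (mirrored by the server changes further below; variable names are illustrative):

    // spec was created earlier via common_speculative_init(params.speculative, ctx_tgt)
    const int32_t n_draft_min = common_speculative_n_min(spec, params.speculative);
    const int32_t n_draft_max = common_speculative_n_max(spec, params.speculative);

    if (n_draft_max < n_draft_min) {
        // no implementation can produce a useful draft here - skip speculative decoding
    }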
diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp
index 7ba7d79f7..baf61bf27 100644
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@@ -73,12 +73,12 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
     auto ctx_arg = common_params_parser_init(params, md.ex);

     std::vector<common_arg *> common_options;
-    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> sampling_options;
     std::vector<common_arg *> specific_options;
     for (auto & opt : ctx_arg.options) {
         // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
-        if (opt.is_sparam) {
-            sparam_options.push_back(&opt);
+        if (opt.is_sampling) {
+            sampling_options.push_back(&opt);
         } else if (opt.in_example(ctx_arg.ex)) {
             specific_options.push_back(&opt);
         } else {
@@ -93,7 +93,7 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
     ss << "### Common params\n\n";
     write_table(ss, common_options);
     ss << "\n\n### Sampling params\n\n";
-    write_table(ss, sparam_options);
+    write_table(ss, sampling_options);
     ss << "\n\n### " << md.specific_section_header << "\n\n";
     write_table(ss, specific_options);
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
index 6b8f258a4..c0f6c8fc2 100644
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -37,9 +37,9 @@ int main(int argc, char ** argv){
     common_ngram_cache ngram_cache;
     common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
-    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.lookup_cache_static.c_str());
+    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.ngram_cache.lookup_cache_static.c_str());

-    common_ngram_cache_save(ngram_cache, params.speculative.lookup_cache_static);
+    common_ngram_cache_save(ngram_cache, params.speculative.ngram_cache.lookup_cache_static);

     return 0;
 }
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 847976ddc..84642e05d 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -24,7 +24,7 @@ int main(int argc, char ** argv){
         return 1;
     }

-    const int n_draft = params.speculative.n_max;
+    const int n_draft = params.speculative.draft.n_max;

     // init llama.cpp
     llama_backend_init();
@@ -49,18 +49,18 @@ int main(int argc, char ** argv){
     {
         const int64_t t_start_draft_us = ggml_time_us();

-        if (!params.speculative.lookup_cache_static.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
             try {
-                ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
                 exit(1);
             }
         }

-        if (!params.speculative.lookup_cache_dynamic.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
             try {
-                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
             } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
         }
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 74272f17e..2d4c0e528 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -25,7 +25,7 @@ int main(int argc, char ** argv){
     }

     // max. number of additional tokens to draft if match is found
-    const int n_draft = params.speculative.n_max;
+    const int n_draft = params.speculative.draft.n_max;

     // init llama.cpp
     llama_backend_init();
@@ -54,18 +54,18 @@ int main(int argc, char ** argv){
         const int64_t t_start_draft_us = ggml_time_us();

         common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);

-        if (!params.speculative.lookup_cache_static.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
             try {
-                ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
+                ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
             } catch (std::ifstream::failure const &) {
-                LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
+                LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
                 exit(1);
             }
         }

-        if (!params.speculative.lookup_cache_dynamic.empty()) {
+        if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
             try {
-                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
+                ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
             } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
         }
@@ -213,7 +213,7 @@ int main(int argc, char ** argv){

     // Update dynamic ngram cache with context ngram cache and save it to disk:
     common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-    common_ngram_cache_save(ngram_cache_dynamic, params.speculative.lookup_cache_dynamic);
+    common_ngram_cache_save(ngram_cache_dynamic, params.speculative.ngram_cache.lookup_cache_dynamic);

     LOG("\n\n");
diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp
index 73394b74e..5b61b62a1 100644
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -43,7 +43,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    if (params.speculative.mparams_dft.path.empty()) {
+    if (params.speculative.draft.mparams.path.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
@@ -77,7 +77,7 @@ int main(int argc, char ** argv) {

     // TODO: simplify this logic
     {
-        const auto & params_spec = params.speculative;
+        const auto & params_spec = params.speculative.draft;

         auto params_dft = params;
@@ -85,15 +85,15 @@ int main(int argc, char ** argv) {
         params_dft.n_ctx        = params_spec.n_ctx;
         params_dft.n_batch      = llama_n_ctx_seq(ctx_tgt);
         params_dft.devices      = params_spec.devices;
-        params_dft.model        = params_spec.mparams_dft;
+        params_dft.model        = params_spec.mparams;
         params_dft.n_gpu_layers = params_spec.n_gpu_layers;

         if (params_spec.cpuparams.n_threads > 0) {
-            params_dft.cpuparams.n_threads       = params.speculative.cpuparams.n_threads;
-            params_dft.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
+            params_dft.cpuparams.n_threads       = params.speculative.draft.cpuparams.n_threads;
+            params_dft.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
         }

-        params_dft.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
+        params_dft.tensor_buft_overrides = params.speculative.draft.tensor_buft_overrides;

         auto mparams_dft = common_model_params_to_llama(params_dft);
@@ -103,8 +103,8 @@ int main(int argc, char ** argv) {
             return 1;
         }

-        params.speculative.model_dft   = model_dft.get();
-        params.speculative.cparams_dft = common_context_params_to_llama(params_dft);
+        params.speculative.draft.model   = model_dft.get();
+        params.speculative.draft.cparams = common_context_params_to_llama(params_dft);
     }

     // Tokenize the prompt
@@ -187,16 +187,6 @@ int main(int argc, char ** argv) {
         // generate a new draft
         draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);

-        if ((int) draft.size() > params_spec.n_max) {
-            LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max);
-            draft.resize(params_spec.n_max);
-        }
-
-        if ((int) draft.size() < params_spec.n_min) {
-            LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min);
-            draft.clear();
-        }
-
         // save the original draft size
         n_draft = draft.size();
@@ -220,19 +210,12 @@ int main(int argc, char ** argv) {
             }
         }

-        GGML_ASSERT(n_draft > 0);
-
         // always have a token to evaluate from before - id_last
         common_batch_clear(batch_tgt);
         common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);

         // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
         {
-            // do not waste time on small drafts
-            if (draft.size() < (size_t) params_spec.n_min) {
-                draft.clear();
-            }
-
             for (size_t i = 0; i < draft.size(); ++i) {
                 common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
             }
@@ -340,7 +323,7 @@ int main(int argc, char ** argv) {
     LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
     LOG_INF("\n");

-    LOG_INF("n_draft   = %d\n", params_spec.n_max);
+    LOG_INF("n_draft   = %d\n", params_spec.draft.n_max);
     LOG_INF("n_predict = %d\n", n_predict);
     LOG_INF("n_drafted = %d\n", n_drafted);
     LOG_INF("n_accept  = %d\n", n_accept);
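With the n_min/n_max enforcement moved into common_speculative_draft itself (see common/speculative.cpp above), the example loop no longer needs its own truncation and small-draft checks. Condensed for orientation, one speculative step now reduces to the following (identifiers as in the example; setup and result handling omitted):

    // draft tokens; size limits are applied inside common_speculative_draft
    llama_tokens draft = common_speculative_draft(spec, params.speculative, prompt_tgt, id_last);

    // always have a token to evaluate from before - id_last
    common_batch_clear(batch_tgt);
    common_batch_add  (batch_tgt, id_last, n_past++, { 0 }, true);

    // the draft can be appended unconditionally
    for (size_t i = 0; i < draft.size(); ++i) {
        common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
    }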
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 8f56a659b..6ed9c9143 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -49,7 +49,7 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    if (params.speculative.mparams_dft.path.empty()) {
+    if (params.speculative.draft.mparams.path.empty()) {
         LOG_ERR("%s: --model-draft is required\n", __func__);
         return 1;
     }
@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
     const int n_seq_dft = params.n_parallel;

     // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
-    const float p_draft_split = params.speculative.p_split;
+    const float p_draft_split = params.speculative.draft.p_split;

     std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
     std::uniform_real_distribution<> u_dist;
@@ -80,15 +80,15 @@ int main(int argc, char ** argv) {
     ctx_tgt = llama_init_tgt->context();

     // load the draft model
-    params.devices      = params.speculative.devices;
-    params.model        = params.speculative.mparams_dft;
-    params.n_gpu_layers = params.speculative.n_gpu_layers;
-    if (params.speculative.cpuparams.n_threads > 0) {
-        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
+    params.devices      = params.speculative.draft.devices;
+    params.model        = params.speculative.draft.mparams;
+    params.n_gpu_layers = params.speculative.draft.n_gpu_layers;
+    if (params.speculative.draft.cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.speculative.draft.cpuparams.n_threads;
     }

-    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
-    params.tensor_buft_overrides     = params.speculative.tensor_buft_overrides;
+    params.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
+    params.tensor_buft_overrides     = params.speculative.draft.tensor_buft_overrides;

     auto llama_init_dft = common_init_from_params(params);
@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
     //GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));

     // how many tokens to draft each time
-    int n_draft = params.speculative.n_max;
+    int n_draft = params.speculative.draft.n_max;

     int n_predict = 0;
     int n_drafted = 0;
diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp
index 67f8ca632..0dd8422e7 100644
--- a/tests/test-arg-parser.cpp
+++ b/tests/test-arg-parser.cpp
@@ -40,8 +40,12 @@ int main(void) {
             }
         }

+        // exclude spec args from this check
+        // ref: https://github.com/ggml-org/llama.cpp/pull/22397
+        const bool skip = opt.is_spec;
+
         // ensure shorter argument precedes longer argument
-        if (opt.args.size() > 1) {
+        if (!skip && opt.args.size() > 1) {
             const std::string first(opt.args.front());
             const std::string last(opt.args.back());
@@ -124,9 +128,9 @@ int main(void) {
     assert(params.n_batch == 9090);

     // --draft cannot be used outside llama-speculative
-    argv = {"binary_name", "--draft", "123"};
+    argv = {"binary_name", "--spec-draft-n-max", "123"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
-    assert(params.speculative.n_max == 123);
+    assert(params.speculative.draft.n_max == 123);

     // multi-value args (CSV)
     argv = {"binary_name", "--lora", "file1.gguf,\"file2,2.gguf\",\"file3\"\"3\"\".gguf\",file4\".gguf"};
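The CLI surface follows the new layout: the old --draft / --draft-min / --draft-max spellings are replaced by the --spec-draft-n-* family, and the parsed values land in params.speculative.draft. A sketch in the style of the test above (the n_min flag is exercised by the Python harness below; treat the exact combination here as illustrative):

    argv = {"binary_name", "--spec-draft-n-max", "16", "--spec-draft-n-min", "8"};
    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
    assert(params.speculative.draft.n_max == 16);
    assert(params.speculative.draft.n_min == 8);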
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index e21a80e69..07198fb16 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -372,7 +372,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_ubatch     */ { 512 },
     /* type_k       */ { GGML_TYPE_F16 },
    /* type_v       */ { GGML_TYPE_F16 },
-    /* n_threads    */ { cpu_get_num_math() },
+    /* n_threads    */ { common_cpu_get_num_math() },
     /* cpu_mask     */ { "0x0" },
     /* cpu_strict   */ { false },
     /* poll         */ { 50 },
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 567ddd4f9..e3822225b 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -309,8 +309,10 @@ struct server_slot {
             return 0;
         }

+        const int n_draft_min = common_speculative_n_min(spec.get(), task->params.speculative);
+
         // determine the max draft that fits the current slot state
-        int n_draft_max = task->params.speculative.n_max;
+        int n_draft_max = common_speculative_n_max(spec.get(), task->params.speculative);

         // note: slot.prompt is not yet expanded with the `id` token sampled above
         // also, need to leave space for 1 extra token to allow context shifts
@@ -322,8 +324,8 @@ struct server_slot {

         SLT_DBG(*this, "max possible draft: %d\n", n_draft_max);

-        if (n_draft_max < task->params.speculative.n_min) {
-            SLT_DBG(*this, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, task->params.speculative.n_min);
+        if (n_draft_max < n_draft_min) {
+            SLT_DBG(*this, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, n_draft_min);

             n_draft_max = 0;
         }
@@ -358,11 +360,6 @@ struct server_slot {
             spec_draft.resize(n_draft_max);
         }

-        if (spec_draft.size() < (size_t) params_spec.n_min) {
-            SLT_DBG(*this, "ignoring small draft: %d < %d\n", (int) spec_draft.size(), params_spec.n_min);
-            spec_draft.clear();
-        }
-
         if (!spec_draft.empty() && ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
             const auto n_tokens = prompt.tokens.size();
@@ -770,9 +767,9 @@ private:
         if (params_base.speculative.has_dft()) {
             // TODO speculative: move to common/speculative.cpp?
-            SRV_INF("loading draft model '%s'\n", params_base.speculative.mparams_dft.path.c_str());
+            const auto & params_spec = params_base.speculative.draft;

-            const auto & params_spec = params_base.speculative;
+            SRV_INF("loading draft model '%s'\n", params_spec.mparams.path.c_str());

             auto params_dft = params_base;
@@ -780,7 +777,7 @@ private:
             params_dft.n_ctx        = params_spec.n_ctx == 0 ? llama_n_ctx_seq(ctx) : params_spec.n_ctx;
             params_dft.n_batch      = llama_n_ctx_seq(ctx);
             params_dft.devices      = params_spec.devices;
-            params_dft.model        = params_spec.mparams_dft;
+            params_dft.model        = params_spec.mparams;
             params_dft.n_gpu_layers = params_spec.n_gpu_layers;
             params_dft.cache_type_k = params_spec.cache_type_k;
             params_dft.cache_type_v = params_spec.cache_type_v;
@@ -800,8 +797,8 @@ private:
                 return false;
             }

-            params_base.speculative.model_dft   = model_dft.get();
-            params_base.speculative.cparams_dft = common_context_params_to_llama(params_dft);
+            params_base.speculative.draft.model   = model_dft.get();
+            params_base.speculative.draft.cparams = common_context_params_to_llama(params_dft);
         }

         std::string & mmproj_path = params_base.mmproj.path;
@@ -1310,7 +1307,7 @@ private:
         backend_sampling &= task.params.sampling.backend_sampling;

         // TODO: speculative decoding requires multiple samples per batch - not supported yet
-        backend_sampling &= !(slot.can_speculate() && task.params.speculative.n_max > 0);
+        backend_sampling &= !(slot.can_speculate() && common_speculative_n_max(slot.spec.get(), task.params.speculative) > 0);

         // TODO: getting post/pre sampling logits is not yet supported with backend sampling
         backend_sampling &= !need_logits;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 4c341d7c5..45e5168fa 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -76,13 +76,7 @@ json task_params::to_json(bool only_metrics) const {
             {"reasoning_in_content",     chat_parser_params.reasoning_in_content},
             {"generation_prompt",        chat_parser_params.generation_prompt},
             {"samplers",                 samplers},
-            {"speculative.n_max",        speculative.n_max},
-            {"speculative.n_min",        speculative.n_min},
-            {"speculative.p_min",        speculative.p_min},
             {"speculative.type",         common_speculative_type_to_str(speculative.type)},
-            {"speculative.ngram_size_n", speculative.ngram_size_n},
-            {"speculative.ngram_size_m", speculative.ngram_size_m},
-            {"speculative.ngram_m_hits", speculative.ngram_min_hits},
             {"timings_per_token",        timings_per_token},
             {"post_sampling_probs",      post_sampling_probs},
             {"backend_sampling",         sampling.backend_sampling},
@@ -139,13 +133,7 @@
             {"reasoning_in_content",     chat_parser_params.reasoning_in_content},
             {"generation_prompt",        chat_parser_params.generation_prompt},
             {"samplers",                 samplers},
-            {"speculative.n_max",        speculative.n_max},
-            {"speculative.n_min",        speculative.n_min},
-            {"speculative.p_min",        speculative.p_min},
             {"speculative.type",         common_speculative_type_to_str(speculative.type)},
-            {"speculative.ngram_size_n", speculative.ngram_size_n},
-            {"speculative.ngram_size_m", speculative.ngram_size_m},
-            {"speculative.ngram_m_hits", speculative.ngram_min_hits},
             {"timings_per_token",        timings_per_token},
             {"post_sampling_probs",      post_sampling_probs},
             {"backend_sampling",         sampling.backend_sampling},
@@ -308,14 +296,17 @@ task_params server_task::params_from_json_cmpl(

     params.speculative = defaults.speculative;

-    params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
-    params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
-    params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
+    // TODO: for now, be able to adjust only the draft-model based speculative parameters
+    params.speculative.draft.n_min = json_value(data, "speculative.n_min", defaults.speculative.draft.n_min);
+    params.speculative.draft.n_max = json_value(data, "speculative.n_max", defaults.speculative.draft.n_max);
+    params.speculative.draft.p_min = json_value(data, "speculative.p_min", defaults.speculative.draft.p_min);

-    params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-    params.speculative.n_min = std::max(params.speculative.n_min, 0);
-    params.speculative.n_max = std::max(params.speculative.n_max, 0);
+    params.speculative.draft.n_min = std::min(params.speculative.draft.n_max, params.speculative.draft.n_min);
+    params.speculative.draft.n_min = std::max(params.speculative.draft.n_min, 0);
+    params.speculative.draft.n_max = std::max(params.speculative.draft.n_max, 0);

+#if 0
+    // for debugging and research purposes
     params.speculative.type = common_speculative_type_from_name(json_value(data, "speculative.type", common_speculative_type_to_str(defaults.speculative.type)));

     params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n);
@@ -325,6 +316,7 @@ task_params server_task::params_from_json_cmpl(
     params.speculative.ngram_size_n   = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024);
     params.speculative.ngram_size_m   = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024);
     params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024);
+#endif

     // Use OpenAI API logprobs only if n_probs wasn't provided
     if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index ddbb76c9a..88700487b 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -83,15 +83,14 @@ class ServerProcess:
     kv_unified: bool | None = False
     server_slots: bool | None = False
     pooling: str | None = None
-    draft: int | None = None
     api_key: str | None = None
     models_dir: str | None = None
     models_max: int | None = None
     no_models_autoload: bool | None = None
     lora_files: List[str] | None = None
     enable_ctx_shift: int | None = False
-    draft_min: int | None = None
-    draft_max: int | None = None
+    spec_draft_n_min: int | None = None
+    spec_draft_n_max: int | None = None
     no_webui: bool | None = None
     jinja: bool | None = None
     reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
@@ -165,8 +164,6 @@ class ServerProcess:
             server_args.extend(["--threads", self.n_threads])
         if self.n_gpu_layer:
             server_args.extend(["--n-gpu-layers", self.n_gpu_layer])
-        if self.draft is not None:
-            server_args.extend(["--draft", self.draft])
         if self.server_continuous_batching:
             server_args.append("--cont-batching")
         if self.server_embeddings:
@@ -214,10 +211,10 @@ class ServerProcess:
             server_args.append("--context-shift")
         if self.api_key:
             server_args.extend(["--api-key", self.api_key])
-        if self.draft_max:
-            server_args.extend(["--draft-max", self.draft_max])
-        if self.draft_min:
-            server_args.extend(["--draft-min", self.draft_min])
+        if self.spec_draft_n_max:
+            server_args.extend(["--spec-draft-n-max", self.spec_draft_n_max])
+        if self.spec_draft_n_min:
+            server_args.extend(["--spec-draft-n-min", self.spec_draft_n_min])
         if self.no_webui:
             server_args.append("--no-webui")
        if self.no_models_autoload:
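On the wire nothing changes for clients: the flat "speculative.*" keys are still accepted and reported; they now simply read from and write to the nested draft struct on the server side, and, per the TODO in server-task.cpp, only the draft-model parameters are adjustable per request for now. An illustrative request body (values are arbitrary):

    {
      "prompt": "Once upon a time",
      "speculative.n_min": 4,
      "speculative.n_max": 16,
      "speculative.p_min": 0.75
    }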