This change moves the reasoning_budget_message parameter from the common params into the sampling parameters. It also removes the reasoning_budget common parameter, standardizing on the existing reasoning_budget_tokens parameter in the sampling configuration.

Issue: https://github.com/ggml-org/llama.cpp/issues/20429
Original PR: https://github.com/ggml-org/llama.cpp/pull/20297
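For orientation, a minimal sketch of the parameter layout after this change. This is not the actual header: it assumes only the field names and comments visible in the diffs below, and the llama_token stand-in and the reasoning_budget_tokens default are illustrative.

#include <cstdint>
#include <string>
#include <vector>

using llama_token = int32_t; // stand-in for the real typedef

// The reasoning budget now lives entirely under the sampling parameters.
struct common_params_sampling {
    int32_t reasoning_budget_tokens = -1;             // -1 unrestricted, 0 immediate end, N>0 token budget (assumed default)
    std::vector<llama_token> reasoning_budget_start;  // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;    // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
    std::string reasoning_budget_message;             // message injected before end tag when budget exhausted
};

struct common_params {
    common_params_sampling sampling;
    // reasoning_budget and reasoning_budget_message no longer live here;
    // consumers read params.sampling.reasoning_budget_tokens / .reasoning_budget_message instead.
};

Grouping these fields with the existing reasoning_budget_start/end/forced sampling fields keeps everything the budget mechanism needs in one struct.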
+2 -2
@@ -3122,14 +3122,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
         [](common_params & params, int value) {
             if (value < -1) { throw std::invalid_argument("invalid value"); }
-            params.reasoning_budget = value;
+            params.sampling.reasoning_budget_tokens = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--reasoning-budget-message"}, "MESSAGE",
         "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
         [](common_params & params, const std::string & value) {
-            params.reasoning_budget_message = value;
+            params.sampling.reasoning_budget_message = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
     add_opt(common_arg(
+1 -2
@@ -274,6 +274,7 @@ struct common_params_sampling
     std::vector<llama_token> reasoning_budget_start;  // start tag token sequence
     std::vector<llama_token> reasoning_budget_end;    // end tag token sequence
     std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
+    std::string reasoning_budget_message;             // message injected before end tag when budget exhausted

     bool backend_sampling = false;

@@ -581,8 +582,6 @@ struct common_params
     bool force_pure_content_parser = false;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
-    int reasoning_budget = -1;
-    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
     int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

+2 -2
@@ -77,8 +77,8 @@ struct cli_context
        // defaults.return_progress = true; // TODO: show progress

        verbose_prompt = params.verbose_prompt;
-       reasoning_budget = params.reasoning_budget;
-       reasoning_budget_message = params.reasoning_budget_message;
+       reasoning_budget = params.sampling.reasoning_budget_tokens;
+       reasoning_budget_message = params.sampling.reasoning_budget_message;
    }

    std::string generate_completion(result_timings & out_timings) {
+2 -2
@@ -1045,8 +1045,8 @@ private:
            /* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
            /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
            /* enable_thinking */ enable_thinking,
-           /* reasoning_budget */ params_base.reasoning_budget,
-           /* reasoning_budget_msg */ params_base.reasoning_budget_message,
+           /* reasoning_budget */ params_base.sampling.reasoning_budget_tokens,
+           /* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message,
            /* media_path */ params_base.media_path,
            /* force_pure_content */ params_base.force_pure_content_parser
        };