diff --git a/common/arg.cpp b/common/arg.cpp index 5c0bbe38e..85d84e5cc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3122,14 +3122,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)", [](common_params & params, int value) { if (value < -1) { throw std::invalid_argument("invalid value"); } - params.reasoning_budget = value; + params.sampling.reasoning_budget_tokens = value; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET")); add_opt(common_arg( {"--reasoning-budget-message"}, "MESSAGE", "message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)", [](common_params & params, const std::string & value) { - params.reasoning_budget_message = value; + params.sampling.reasoning_budget_message = value; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE")); add_opt(common_arg( diff --git a/common/common.h b/common/common.h index 4137a87f1..9a8218433 100644 --- a/common/common.h +++ b/common/common.h @@ -274,6 +274,7 @@ struct common_params_sampling { std::vector reasoning_budget_start; // start tag token sequence std::vector reasoning_budget_end; // end tag token sequence std::vector reasoning_budget_forced; // forced sequence (message + end tag) + std::string reasoning_budget_message; // message injected before end tag when budget exhausted bool backend_sampling = false; @@ -581,8 +582,6 @@ struct common_params { bool force_pure_content_parser = false; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable - int reasoning_budget = -1; - std::string reasoning_budget_message; // message injected before end tag when budget exhausted bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 6ee00012d..5bdb1e78f 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -77,8 +77,8 @@ struct cli_context { // defaults.return_progress = true; // TODO: show progress verbose_prompt = params.verbose_prompt; - reasoning_budget = params.reasoning_budget; - reasoning_budget_message = params.reasoning_budget_message; + reasoning_budget = params.sampling.reasoning_budget_tokens; + reasoning_budget_message = params.sampling.reasoning_budget_message; } std::string generate_completion(result_timings & out_timings) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index a55c6356f..53f61b5a9 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1045,8 +1045,8 @@ private: /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, /* enable_thinking */ enable_thinking, - /* reasoning_budget */ params_base.reasoning_budget, - /* reasoning_budget_msg */ params_base.reasoning_budget_message, + /* reasoning_budget */ params_base.sampling.reasoning_budget_tokens, + /* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message, /* media_path */ params_base.media_path, /* force_pure_content */ params_base.force_pure_content_parser };