server : refactor oai_parser_opt, move it to server_chat_params (#18937)

* server_chat_params

* move chat format into CLI

* use meta whenever possible

* clean up, no more chatml fallback
This commit is contained in:
Xuan-Son Nguyen
2026-01-19 23:28:01 +01:00
committed by GitHub
parent 1706a6d7c6
commit 6df686bee6
8 changed files with 112 additions and 103 deletions
+7 -7
View File
@@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
return tmpls->has_explicit_template; return tmpls->has_explicit_template;
} }
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) { std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
if (variant != nullptr) { if (!variant.empty()) {
if (strcmp(variant, "tool_use") == 0) { if (variant == "tool_use") {
if (tmpls->template_tool_use) { if (tmpls->template_tool_use) {
return tmpls->template_tool_use->source().c_str(); return tmpls->template_tool_use->source();
} }
return nullptr; return "";
} else { } else {
LOG_DBG("%s: unknown template variant: %s\n", __func__, variant); LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
} }
} }
return tmpls->template_default->source().c_str(); return tmpls->template_default->source();
} }
common_chat_templates_ptr common_chat_templates_init( common_chat_templates_ptr common_chat_templates_init(
+1 -1
View File
@@ -191,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init(
const std::string & eos_token_override = ""); const std::string & eos_token_override = "");
bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls); bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr); std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
struct common_chat_params common_chat_templates_apply( struct common_chat_params common_chat_templates_apply(
+27 -5
View File
@@ -71,14 +71,16 @@ struct cli_context {
std::string generate_completion(result_timings & out_timings) { std::string generate_completion(result_timings & out_timings) {
server_response_reader rd = ctx_server.get_response_reader(); server_response_reader rd = ctx_server.get_response_reader();
auto formatted = format_chat();
{ {
// TODO: reduce some copies here in the future // TODO: reduce some copies here in the future
server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
task.id = rd.get_new_id(); task.id = rd.get_new_id();
task.index = 0; task.index = 0;
task.params = defaults; // copy task.params = defaults; // copy
task.cli_input = messages; // copy task.cli_prompt = formatted.prompt; // copy
task.cli_files = input_files; // copy task.cli_files = input_files; // copy
task.cli = true;
rd.post_task({std::move(task)}); rd.post_task({std::move(task)});
} }
@@ -156,6 +158,26 @@ struct cli_context {
return content; return content;
} }
} }
common_chat_params format_chat() {
auto meta = ctx_server.get_meta();
auto & chat_params = meta.chat_params;
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
inputs.tools = {}; // TODO
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
inputs.json_schema = ""; // TODO
inputs.grammar = ""; // TODO
inputs.use_jinja = chat_params.use_jinja;
inputs.parallel_tool_calls = false;
inputs.add_generation_prompt = true;
inputs.reasoning_format = chat_params.reasoning_format;
inputs.enable_thinking = chat_params.enable_thinking;
// Apply chat template to the list of messages
return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
}
}; };
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
+2 -2
View File
@@ -831,7 +831,7 @@ static void handle_media(
// used by /chat/completions endpoint // used by /chat/completions endpoint
json oaicompat_chat_params_parse( json oaicompat_chat_params_parse(
json & body, /* openai api json semantics */ json & body, /* openai api json semantics */
const oaicompat_parser_options & opt, const server_chat_params & opt,
std::vector<raw_buffer> & out_files) std::vector<raw_buffer> & out_files)
{ {
json llama_params; json llama_params;
@@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse(
} }
// Apply chat template to the list of messages // Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
/* Append assistant prefilled message */ /* Append assistant prefilled message */
if (prefill_assistant_message) { if (prefill_assistant_message) {
+7 -7
View File
@@ -274,25 +274,25 @@ std::vector<server_tokens> tokenize_input_prompts(
// OAI utils // OAI utils
// //
// used by /completions endpoint struct server_chat_params {
json oaicompat_completion_params_parse(const json & body);
struct oaicompat_parser_options {
bool use_jinja; bool use_jinja;
bool prefill_assistant; bool prefill_assistant;
common_reasoning_format reasoning_format; common_reasoning_format reasoning_format;
std::map<std::string,std::string> chat_template_kwargs; std::map<std::string, std::string> chat_template_kwargs; // mapping key --> json value
common_chat_templates * tmpls; common_chat_templates_ptr tmpls;
bool allow_image; bool allow_image;
bool allow_audio; bool allow_audio;
bool enable_thinking = true; bool enable_thinking = true;
std::string media_path; std::string media_path;
}; };
// used by /completions endpoint
json oaicompat_completion_params_parse(const json & body);
// used by /chat/completions endpoint // used by /chat/completions endpoint
json oaicompat_chat_params_parse( json oaicompat_chat_params_parse(
json & body, /* openai api json semantics */ json & body, /* openai api json semantics */
const oaicompat_parser_options & opt, const server_chat_params & opt,
std::vector<raw_buffer> & out_files); std::vector<raw_buffer> & out_files);
// convert Anthropic Messages API format to OpenAI Chat Completions API format // convert Anthropic Messages API format to OpenAI Chat Completions API format
+62 -76
View File
@@ -534,8 +534,8 @@ public:
server_queue queue_tasks; server_queue queue_tasks;
server_response queue_results; server_response queue_results;
common_chat_templates_ptr chat_templates; // note: chat_params must not be refreshed upon exiting sleeping state
oaicompat_parser_options oai_parser_opt; server_chat_params chat_params;
~server_context_impl() { ~server_context_impl() {
if (!sleeping) { if (!sleeping) {
@@ -688,15 +688,6 @@ private:
llama_init_dft->free_context(); llama_init_dft->free_context();
} }
chat_templates = common_chat_templates_init(model, params_base.chat_template);
try {
common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
} catch (const std::exception & e) {
SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
chat_templates = common_chat_templates_init(model, "chatml");
}
std::string & mmproj_path = params_base.mmproj.path; std::string & mmproj_path = params_base.mmproj.path;
if (!mmproj_path.empty()) { if (!mmproj_path.empty()) {
if (!is_resume) { if (!is_resume) {
@@ -845,30 +836,6 @@ private:
model_name = model_path.filename().string(); model_name = model_path.filename().string();
} }
// thinking is enabled if:
// 1. It's not explicitly disabled (reasoning_budget == 0)
// 2. The chat template supports it
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
SRV_INF("thinking = %d\n", enable_thinking);
oai_parser_opt = {
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* reasoning_format */ params_base.reasoning_format,
/* chat_template_kwargs */ params_base.default_template_kwargs,
/* common_chat_templates */ chat_templates.get(),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* media_path */ params_base.media_path,
};
// print sample chat example to make it clear which template is used
// @ngxson modern templates are too long, spam the logs; printing the example is enough
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
// common_chat_templates_source(chat_templates.get()),
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
if (!is_resume) { if (!is_resume) {
return init(); return init();
} }
@@ -907,6 +874,42 @@ private:
} }
} }
// populate chat template params
{
common_chat_templates_ptr chat_templates;
try {
chat_templates = common_chat_templates_init(model, params_base.chat_template);
LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
} catch (const std::exception & e) {
SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
return false;
}
// thinking is enabled if:
// 1. It's not explicitly disabled (i.e. reasoning_budget != 0)
// 2. The chat template supports it
const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
chat_params = {
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* reasoning_format */ params_base.reasoning_format,
/* chat_template_kwargs */ params_base.default_template_kwargs,
/* tmpls */ std::move(chat_templates),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* media_path */ params_base.media_path,
};
}
return true; return true;
} }
@@ -1588,32 +1591,14 @@ private:
// tokenize the input if it's set by CLI, return false on error // tokenize the input if it's set by CLI, return false on error
bool tokenize_cli_input(server_task & task) { bool tokenize_cli_input(server_task & task) {
GGML_ASSERT(task.cli_input != nullptr);
try { try {
auto & opt = oai_parser_opt; auto & prompt = task.cli_prompt;
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
inputs.tools = {}; // TODO
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
inputs.json_schema = ""; // TODO
inputs.grammar = ""; // TODO
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = false;
inputs.add_generation_prompt = true;
inputs.reasoning_format = opt.reasoning_format;
inputs.enable_thinking = opt.enable_thinking;
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
// tokenize the resulting prompt
auto & prompt = chat_params.prompt;
if (mctx != nullptr) { if (mctx != nullptr) {
task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files); task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
} else { } else {
task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]); task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
} }
task.cli_input.clear(); task.cli_prompt.clear();
task.cli_files.clear(); task.cli_files.clear();
} catch (const std::exception & e) { } catch (const std::exception & e) {
send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST); send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
@@ -1689,7 +1674,7 @@ private:
{ {
// special case: if input is provided via CLI, tokenize it first // special case: if input is provided via CLI, tokenize it first
// otherwise, no need to tokenize as it's already done inside the HTTP thread // otherwise, no need to tokenize as it's already done inside the HTTP thread
if (task.cli_input != nullptr) { if (task.cli) {
if (!tokenize_cli_input(task)) { if (!tokenize_cli_input(task)) {
break; break;
} }
@@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
} }
server_context_meta server_context::get_meta() const { server_context_meta server_context::get_meta() const {
auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
auto bos_id = llama_vocab_bos(impl->vocab); auto bos_id = llama_vocab_bos(impl->vocab);
auto eos_id = llama_vocab_eos(impl->vocab); auto eos_id = llama_vocab_eos(impl->vocab);
auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : ""; auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
@@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
/* model_name */ impl->model_name, /* model_name */ impl->model_name,
/* model_path */ impl->params_base.model.path, /* model_path */ impl->params_base.model.path,
/* has_mtmd */ impl->mctx != nullptr, /* has_mtmd */ impl->mctx != nullptr,
/* has_inp_image */ impl->oai_parser_opt.allow_image, /* has_inp_image */ impl->chat_params.allow_image,
/* has_inp_audio */ impl->oai_parser_opt.allow_audio, /* has_inp_audio */ impl->chat_params.allow_audio,
/* json_webui_settings */ impl->json_webui_settings, /* json_webui_settings */ impl->json_webui_settings,
/* slot_n_ctx */ impl->get_slot_n_ctx(), /* slot_n_ctx */ impl->get_slot_n_ctx(),
/* pooling_type */ llama_pooling_type(impl->ctx), /* pooling_type */ llama_pooling_type(impl->ctx),
/* chat_template */ common_chat_templates_source(impl->chat_templates.get()), /* chat_params */ impl->chat_params,
/* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
/* bos_token_str */ bos_token_str, /* bos_token_str */ bos_token_str,
/* eos_token_str */ eos_token_str, /* eos_token_str */ eos_token_str,
@@ -3202,8 +3184,8 @@ void server_routes::init_routes() {
// this endpoint can be accessed during sleeping // this endpoint can be accessed during sleeping
// the next LOC is to avoid someone accidentally use ctx_server // the next LOC is to avoid someone accidentally use ctx_server
bool server_ctx; // do NOT delete this line bool ctx_server; // do NOT delete this line
GGML_UNUSED(server_ctx); GGML_UNUSED(ctx_server);
res->ok({{"status", "ok"}}); res->ok({{"status", "ok"}});
return res; return res;
@@ -3393,8 +3375,8 @@ void server_routes::init_routes() {
// this endpoint can be accessed during sleeping // this endpoint can be accessed during sleeping
// the next LOC is to avoid someone accidentally use ctx_server // the next LOC is to avoid someone accidentally use ctx_server
bool server_ctx; // do NOT delete this line bool ctx_server; // do NOT delete this line
GGML_UNUSED(server_ctx); GGML_UNUSED(ctx_server);
task_params tparams; task_params tparams;
tparams.sampling = params.sampling; tparams.sampling = params.sampling;
@@ -3403,6 +3385,9 @@ void server_routes::init_routes() {
{ "n_ctx", meta->slot_n_ctx }, { "n_ctx", meta->slot_n_ctx },
}; };
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
json props = { json props = {
{ "default_generation_settings", default_generation_settings_for_props }, { "default_generation_settings", default_generation_settings_for_props },
{ "total_slots", params.n_parallel }, { "total_slots", params.n_parallel },
@@ -3417,15 +3402,15 @@ void server_routes::init_routes() {
{ "endpoint_metrics", params.endpoint_metrics }, { "endpoint_metrics", params.endpoint_metrics },
{ "webui", params.webui }, { "webui", params.webui },
{ "webui_settings", meta->json_webui_settings }, { "webui_settings", meta->json_webui_settings },
{ "chat_template", meta->chat_template }, { "chat_template", tmpl_default },
{ "bos_token", meta->bos_token_str }, { "bos_token", meta->bos_token_str },
{ "eos_token", meta->eos_token_str }, { "eos_token", meta->eos_token_str },
{ "build_info", meta->build_info }, { "build_info", meta->build_info },
{ "is_sleeping", queue_tasks.is_sleeping() }, { "is_sleeping", queue_tasks.is_sleeping() },
}; };
if (params.use_jinja) { if (params.use_jinja) {
if (!meta->chat_template_tool_use.empty()) { if (!tmpl_tools.empty()) {
props["chat_template_tool_use"] = meta->chat_template_tool_use; props["chat_template_tool_use"] = tmpl_tools;
} }
} }
res->ok(props); res->ok(props);
@@ -3446,6 +3431,7 @@ void server_routes::init_routes() {
this->get_api_show = [this](const server_http_req &) { this->get_api_show = [this](const server_http_req &) {
auto res = create_response(); auto res = create_response();
std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
json data = { json data = {
{ {
"model_info", { "model_info", {
@@ -3454,7 +3440,7 @@ void server_routes::init_routes() {
}, },
{"modelfile", ""}, {"modelfile", ""},
{"parameters", ""}, {"parameters", ""},
{"template", meta->chat_template}, {"template", tmpl_default},
{"details", { {"details", {
{"parent_model", ""}, {"parent_model", ""},
{"format", "gguf"}, {"format", "gguf"},
@@ -3579,7 +3565,7 @@ void server_routes::init_routes() {
json body = json::parse(req.body); json body = json::parse(req.body);
json body_parsed = oaicompat_chat_params_parse( json body_parsed = oaicompat_chat_params_parse(
body, body,
ctx_server.oai_parser_opt, meta->chat_params,
files); files);
return handle_completions_impl( return handle_completions_impl(
req, req,
@@ -3595,7 +3581,7 @@ void server_routes::init_routes() {
json body = convert_anthropic_to_oai(json::parse(req.body)); json body = convert_anthropic_to_oai(json::parse(req.body));
json body_parsed = oaicompat_chat_params_parse( json body_parsed = oaicompat_chat_params_parse(
body, body,
ctx_server.oai_parser_opt, meta->chat_params,
files); files);
return handle_completions_impl( return handle_completions_impl(
req, req,
@@ -3611,7 +3597,7 @@ void server_routes::init_routes() {
json body = convert_anthropic_to_oai(json::parse(req.body)); json body = convert_anthropic_to_oai(json::parse(req.body));
json body_parsed = oaicompat_chat_params_parse( json body_parsed = oaicompat_chat_params_parse(
body, body,
ctx_server.oai_parser_opt, meta->chat_params,
files); files);
json prompt = body_parsed.at("prompt"); json prompt = body_parsed.at("prompt");
@@ -3627,7 +3613,7 @@ void server_routes::init_routes() {
json body = json::parse(req.body); json body = json::parse(req.body);
json data = oaicompat_chat_params_parse( json data = oaicompat_chat_params_parse(
body, body,
ctx_server.oai_parser_opt, meta->chat_params,
files); files);
res->ok({{ "prompt", std::move(data.at("prompt")) }}); res->ok({{ "prompt", std::move(data.at("prompt")) }});
return res; return res;
@@ -3638,8 +3624,8 @@ void server_routes::init_routes() {
// this endpoint can be accessed during sleeping // this endpoint can be accessed during sleeping
// the next LOC is to avoid someone accidentally use ctx_server // the next LOC is to avoid someone accidentally use ctx_server
bool server_ctx; // do NOT delete this line bool ctx_server; // do NOT delete this line
GGML_UNUSED(server_ctx); GGML_UNUSED(ctx_server);
json models = { json models = {
{"models", { {"models", {
+2 -3
View File
@@ -20,9 +20,8 @@ struct server_context_meta {
int slot_n_ctx; int slot_n_ctx;
enum llama_pooling_type pooling_type; enum llama_pooling_type pooling_type;
// chat template // chat params
std::string chat_template; server_chat_params & chat_params;
std::string chat_template_tool_use;
// tokens // tokens
std::string bos_token_str; std::string bos_token_str;
+4 -2
View File
@@ -130,8 +130,10 @@ struct server_task {
task_params params; task_params params;
server_tokens tokens; server_tokens tokens;
// only used by CLI, this delegates the tokenization to the server // only used by CLI, this allows tokenizing CLI inputs on the server side
json cli_input = nullptr; // we need this because mtmd_context and vocab are not accessible outside of server_context
bool cli = false;
std::string cli_prompt;
std::vector<raw_buffer> cli_files; std::vector<raw_buffer> cli_files;
server_task_type type; server_task_type type;