server : refactor oai_parser_opt, move it to server_chat_params (#18937)

* server_chat_params
* move chat format into CLI
* use meta whenever possible
* clean up, no more chatml fallback
2026-01-19 23:28:01 +01:00
parent 1706a6d7c6
commit 6df686bee6
8 changed files with 112 additions and 103 deletions
@@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
    return tmpls->has_explicit_template;
 }
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
+std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
-    if (variant != nullptr) {
+    if (!variant.empty()) {
-        if (strcmp(variant, "tool_use") == 0) {
+        if (variant == "tool_use") {
            if (tmpls->template_tool_use) {
-                return tmpls->template_tool_use->source().c_str();
+                return tmpls->template_tool_use->source();
            }
-            return nullptr;
+            return "";
        } else {
-            LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
+            LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
        }
    }
-    return tmpls->template_default->source().c_str();
+    return tmpls->template_default->source();
 }
 common_chat_templates_ptr common_chat_templates_init(
@@ -191,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init(
                                           const std::string & eos_token_override = "");
 bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
+std::string  common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
 struct common_chat_params      common_chat_templates_apply(
@@ -71,14 +71,16 @@ struct cli_context {
    std::string generate_completion(result_timings & out_timings) {
        server_response_reader rd = ctx_server.get_response_reader();
        auto formatted = format_chat();
        {
            // TODO: reduce some copies here in the future
            server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
-            task.id        = rd.get_new_id();
+            task.id         = rd.get_new_id();
-            task.index     = 0;
+            task.index      = 0;
-            task.params    = defaults;    // copy
+            task.params     = defaults;         // copy
-            task.cli_input = messages;    // copy
+            task.cli_prompt = formatted.prompt; // copy
-            task.cli_files = input_files; // copy
+            task.cli_files  = input_files;      // copy
            task.cli        = true;
            rd.post_task({std::move(task)});
        }
@@ -156,6 +158,26 @@ struct cli_context {
            return content;
        }
    }
    // Format the `messages` member into a prompt by applying the chat template.
    //
    // The template and the formatting flags (use_jinja, reasoning_format,
    // enable_thinking) are read from the chat_params exposed by the server meta.
    // Tools, JSON schema and grammar are not wired up yet: they are left empty and
    // tool calls are disabled.
    //
    // Returns the common_chat_params produced by common_chat_templates_apply().
    common_chat_params format_chat() {
        auto meta = ctx_server.get_meta();
        auto & chat_params = meta.chat_params;
        common_chat_templates_inputs inputs;
        // messages are parsed from their OAI-compatible JSON representation
        inputs.messages              = common_chat_msgs_parse_oaicompat(messages);
        inputs.tools                 = {}; // TODO
        inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
        inputs.json_schema           = ""; // TODO
        inputs.grammar               = ""; // TODO
        inputs.use_jinja             = chat_params.use_jinja;
        inputs.parallel_tool_calls   = false;
        // always request the generation prompt for the next turn
        inputs.add_generation_prompt = true;
        inputs.reasoning_format      = chat_params.reasoning_format;
        inputs.enable_thinking       = chat_params.enable_thinking;
        // Apply chat template to the list of messages
        return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
    }
 };
 int main(int argc, char ** argv) {
@@ -831,7 +831,7 @@ static void handle_media(
 // used by /chat/completions endpoint
 json oaicompat_chat_params_parse(
    json & body, /* openai api json semantics */
-    const oaicompat_parser_options & opt,
+    const server_chat_params & opt,
    std::vector<raw_buffer> & out_files)
 {
    json llama_params;
@@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse(
    }
    // Apply chat template to the list of messages
-    auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
+    auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
    /* Append assistant prefilled message */
    if (prefill_assistant_message) {
@@ -274,25 +274,25 @@ std::vector<server_tokens> tokenize_input_prompts(
 // OAI utils
 //
-// used by /completions endpoint
+struct server_chat_params {
 json oaicompat_completion_params_parse(const json & body);
 struct oaicompat_parser_options {
    bool use_jinja;
    bool prefill_assistant;
    common_reasoning_format reasoning_format;
-    std::map<std::string,std::string> chat_template_kwargs;
+    std::map<std::string, std::string> chat_template_kwargs; // mapping key --> json value
-    common_chat_templates * tmpls;
+    common_chat_templates_ptr tmpls;
    bool allow_image;
    bool allow_audio;
    bool enable_thinking = true;
    std::string media_path;
 };
 // used by /completions endpoint
 json oaicompat_completion_params_parse(const json & body);
 // used by /chat/completions endpoint
 json oaicompat_chat_params_parse(
    json & body, /* openai api json semantics */
-    const oaicompat_parser_options & opt,
+    const server_chat_params & opt,
    std::vector<raw_buffer> & out_files);
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
@@ -534,8 +534,8 @@ public:
    server_queue    queue_tasks;
    server_response queue_results;
-    common_chat_templates_ptr chat_templates;
+    // note: chat_params must not be refreshed upon exiting sleeping state
-    oaicompat_parser_options  oai_parser_opt;
+    server_chat_params chat_params;
    ~server_context_impl() {
        if (!sleeping) {
@@ -688,15 +688,6 @@ private:
            llama_init_dft->free_context();
        }
        chat_templates = common_chat_templates_init(model, params_base.chat_template);
        try {
            common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
        } catch (const std::exception & e) {
            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
            chat_templates = common_chat_templates_init(model, "chatml");
        }
        std::string & mmproj_path = params_base.mmproj.path;
        if (!mmproj_path.empty()) {
            if (!is_resume) {
@@ -845,30 +836,6 @@ private:
            model_name = model_path.filename().string();
        }
        // thinking is enabled if:
        // 1. It's not explicitly disabled (reasoning_budget == 0)
        // 2. The chat template supports it
        const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
        SRV_INF("thinking = %d\n", enable_thinking);
        oai_parser_opt = {
            /* use_jinja             */ params_base.use_jinja,
            /* prefill_assistant     */ params_base.prefill_assistant,
            /* reasoning_format      */ params_base.reasoning_format,
            /* chat_template_kwargs  */ params_base.default_template_kwargs,
            /* common_chat_templates */ chat_templates.get(),
            /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
            /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
            /* enable_thinking       */ enable_thinking,
            /* media_path            */ params_base.media_path,
        };
        // print sample chat example to make it clear which template is used
        // @ngxson modern templates are too long, spam the logs; printing the example is enough
        LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
        //      common_chat_templates_source(chat_templates.get()),
                common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
        if (!is_resume) {
            return init();
        }
@@ -907,6 +874,42 @@ private:
            }
        }
        // populate chat template params
        {
            common_chat_templates_ptr chat_templates;
            try {
                chat_templates = common_chat_templates_init(model, params_base.chat_template);
                LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
                    common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
            } catch (const std::exception & e) {
                SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
                SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
                SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
                return false;
            }
            // thinking is enabled if:
            // 1. It's not explicitly disabled (reasoning_budget == 0)
            // 2. The chat template supports it
            const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
            SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
            chat_params = {
                /* use_jinja             */ params_base.use_jinja,
                /* prefill_assistant     */ params_base.prefill_assistant,
                /* reasoning_format      */ params_base.reasoning_format,
                /* chat_template_kwargs  */ params_base.default_template_kwargs,
                /* tmpls                 */ std::move(chat_templates),
                /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
                /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
                /* enable_thinking       */ enable_thinking,
                /* media_path            */ params_base.media_path,
            };
        }
        return true;
    }
@@ -1588,32 +1591,14 @@ private:
    // tokenize the input if it's set by CLI, return false on error
    bool tokenize_cli_input(server_task & task) {
        GGML_ASSERT(task.cli_input != nullptr);
        try {
-            auto & opt = oai_parser_opt;
+            auto & prompt = task.cli_prompt;
            common_chat_templates_inputs inputs;
            inputs.messages              = common_chat_msgs_parse_oaicompat(task.cli_input);
            inputs.tools                 = {}; // TODO
            inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
            inputs.json_schema           = ""; // TODO
            inputs.grammar               = ""; // TODO
            inputs.use_jinja             = opt.use_jinja;
            inputs.parallel_tool_calls   = false;
            inputs.add_generation_prompt = true;
            inputs.reasoning_format      = opt.reasoning_format;
            inputs.enable_thinking       = opt.enable_thinking;
            // Apply chat template to the list of messages
            auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
            // tokenize the resulting prompt
            auto & prompt = chat_params.prompt;
            if (mctx != nullptr) {
                task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
            } else {
                task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
            }
-            task.cli_input.clear();
+            task.cli_prompt.clear();
            task.cli_files.clear();
        } catch (const std::exception & e) {
            send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
@@ -1689,7 +1674,7 @@ private:
                {
                    // special case: if input is provided via CLI, tokenize it first
                    // otherwise, no need to tokenize as it's already done inside the HTTP thread
-                    if (task.cli_input != nullptr) {
+                    if (task.cli) {
                        if (!tokenize_cli_input(task)) {
                            break;
                        }
@@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
 }
 server_context_meta server_context::get_meta() const {
    auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
    auto bos_id = llama_vocab_bos(impl->vocab);
    auto eos_id = llama_vocab_eos(impl->vocab);
    auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
@@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
        /* model_name             */ impl->model_name,
        /* model_path             */ impl->params_base.model.path,
        /* has_mtmd               */ impl->mctx != nullptr,
-        /* has_inp_image          */ impl->oai_parser_opt.allow_image,
+        /* has_inp_image          */ impl->chat_params.allow_image,
-        /* has_inp_audio          */ impl->oai_parser_opt.allow_audio,
+        /* has_inp_audio          */ impl->chat_params.allow_audio,
        /* json_webui_settings    */ impl->json_webui_settings,
        /* slot_n_ctx             */ impl->get_slot_n_ctx(),
        /* pooling_type           */ llama_pooling_type(impl->ctx),
-        /* chat_template          */ common_chat_templates_source(impl->chat_templates.get()),
+        /* chat_params            */ impl->chat_params,
        /* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
        /* bos_token_str          */ bos_token_str,
        /* eos_token_str          */ eos_token_str,
@@ -3202,8 +3184,8 @@ void server_routes::init_routes() {
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally using ctx_server
-        bool server_ctx; // do NOT delete this line
+        bool ctx_server; // do NOT delete this line
-        GGML_UNUSED(server_ctx);
+        GGML_UNUSED(ctx_server);
        res->ok({{"status", "ok"}});
        return res;
@@ -3393,8 +3375,8 @@ void server_routes::init_routes() {
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally using ctx_server
-        bool server_ctx; // do NOT delete this line
+        bool ctx_server; // do NOT delete this line
-        GGML_UNUSED(server_ctx);
+        GGML_UNUSED(ctx_server);
        task_params tparams;
        tparams.sampling = params.sampling;
@@ -3403,6 +3385,9 @@ void server_routes::init_routes() {
            { "n_ctx",  meta->slot_n_ctx },
        };
        std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
        std::string tmpl_tools   = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
        json props = {
            { "default_generation_settings", default_generation_settings_for_props },
            { "total_slots",                 params.n_parallel },
@@ -3417,15 +3402,15 @@ void server_routes::init_routes() {
            { "endpoint_metrics",            params.endpoint_metrics },
            { "webui",                       params.webui },
            { "webui_settings",              meta->json_webui_settings },
-            { "chat_template",               meta->chat_template },
+            { "chat_template",               tmpl_default },
            { "bos_token",                   meta->bos_token_str },
            { "eos_token",                   meta->eos_token_str },
            { "build_info",                  meta->build_info },
            { "is_sleeping",                 queue_tasks.is_sleeping() },
        };
        if (params.use_jinja) {
-            if (!meta->chat_template_tool_use.empty()) {
+            if (!tmpl_tools.empty()) {
-                props["chat_template_tool_use"] = meta->chat_template_tool_use;
+                props["chat_template_tool_use"] = tmpl_tools;
            }
        }
        res->ok(props);
@@ -3446,6 +3431,7 @@ void server_routes::init_routes() {
    this->get_api_show = [this](const server_http_req &) {
        auto res = create_response();
        std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
        json data = {
            {
                "model_info", {
@@ -3454,7 +3440,7 @@ void server_routes::init_routes() {
            },
            {"modelfile", ""},
            {"parameters", ""},
-            {"template", meta->chat_template},
+            {"template", tmpl_default},
            {"details", {
                {"parent_model", ""},
                {"format", "gguf"},
@@ -3579,7 +3565,7 @@ void server_routes::init_routes() {
        json body = json::parse(req.body);
        json body_parsed = oaicompat_chat_params_parse(
            body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
            files);
        return handle_completions_impl(
            req,
@@ -3595,7 +3581,7 @@ void server_routes::init_routes() {
        json body = convert_anthropic_to_oai(json::parse(req.body));
        json body_parsed = oaicompat_chat_params_parse(
            body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
            files);
        return handle_completions_impl(
            req,
@@ -3611,7 +3597,7 @@ void server_routes::init_routes() {
        json body = convert_anthropic_to_oai(json::parse(req.body));
        json body_parsed = oaicompat_chat_params_parse(
            body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
            files);
        json prompt = body_parsed.at("prompt");
@@ -3627,7 +3613,7 @@ void server_routes::init_routes() {
        json body = json::parse(req.body);
        json data = oaicompat_chat_params_parse(
            body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
            files);
        res->ok({{ "prompt", std::move(data.at("prompt")) }});
        return res;
@@ -3638,8 +3624,8 @@ void server_routes::init_routes() {
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally using ctx_server
-        bool server_ctx; // do NOT delete this line
+        bool ctx_server; // do NOT delete this line
-        GGML_UNUSED(server_ctx);
+        GGML_UNUSED(ctx_server);
        json models = {
            {"models", {
@@ -20,9 +20,8 @@ struct server_context_meta {
    int slot_n_ctx;
    enum llama_pooling_type pooling_type;
-    // chat template
+    // chat params
-    std::string chat_template;
+    server_chat_params & chat_params;
    std::string chat_template_tool_use;
    // tokens
    std::string bos_token_str;
@@ -130,8 +130,10 @@ struct server_task {
    task_params   params;
    server_tokens tokens;
-    // only used by CLI, this delegates the tokenization to the server
+    // only used by CLI, this allows tokenizing CLI inputs on server side
-    json                    cli_input = nullptr;
+    // we need this because mtmd_context and vocab are not accessible outside of server_context
    bool                    cli = false;
    std::string             cli_prompt;
    std::vector<raw_buffer> cli_files;
    server_task_type type;