cli: new CLI experience (#17824)

* wip

* wip

* fix logging, add display info

* handle commands

* add args

* wip

* move old cli to llama-completion

* rm deprecation notice

* move server to a shared library

* move ci to llama-completion

* add loading animation

* add --show-timings arg

* add /read command, improve LOG_ERR

* add args for speculative decoding, enable show timings by default

* add args --image and --audio

* fix windows build

* support reasoning_content

* fix llama2c workflow

* color default is auto

* fix merge conflicts

* properly fix color problem

Co-authored-by: bandoti <bandoti@users.noreply.github.com>

* better loading spinner

* make sure to clean color on force-exit

* also clear input files on "/clear"

* simplify common_log_flush

* add warning in mtmd-cli

* implement console writer

* fix data race

* add attribute

* fix llama-completion and mtmd-cli

* add some notes about console::log

* fix compilation

---------

Co-authored-by: bandoti <bandoti@users.noreply.github.com>
Author: Xuan-Son Nguyen
Date: 2025-12-10 15:28:59 +01:00
Committed by: GitHub
Parent: b677721819
Commit: 6c2131773c
26 changed files with 742 additions and 148 deletions
@@ -1474,6 +1474,44 @@ struct server_context_impl {
    // Functions to process the task
    //

    // tokenize the input if it's set by CLI, return false on error
    bool tokenize_cli_input(server_task & task) {
        if (task.cli_input == nullptr) {
            return true; // nothing to do
        }

        try {
            auto & opt = oai_parser_opt;

            common_chat_templates_inputs inputs;
            inputs.messages              = common_chat_msgs_parse_oaicompat(task.cli_input);
            inputs.tools                 = {}; // TODO
            inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
            inputs.json_schema           = ""; // TODO
            inputs.grammar               = ""; // TODO
            inputs.use_jinja             = opt.use_jinja;
            inputs.parallel_tool_calls   = false;
            inputs.add_generation_prompt = true;
            inputs.reasoning_format      = opt.reasoning_format;
            inputs.enable_thinking       = opt.enable_thinking;

            // Apply chat template to the list of messages
            auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);

            // tokenize the resulting prompt
            auto & prompt = chat_params.prompt;
            if (mctx != nullptr) {
                task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
            } else {
                task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
            }

            task.cli_input.clear();
            task.cli_files.clear();
        } catch (const std::exception & e) {
            send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
            return false;
        }

        return true;
    }

    void process_single_task(server_task && task) {
        switch (task.type) {
            case SERVER_TASK_TYPE_COMPLETION:
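
For context on the common_chat_* flow used above, the stand-alone sketch below mirrors how an OAI-style message list becomes a prompt string before tokenization. It is illustrative only: the includes, the json literal, and the render_cli_prompt wrapper are assumptions and are not part of this commit; only the common_chat_msgs_parse_oaicompat and common_chat_templates_apply calls are taken from the diff.

// illustrative sketch, not part of the diff; header paths and inputs are assumed
#include <string>
#include <nlohmann/json.hpp>
#include "chat.h" // common_chat_* helpers from llama.cpp's common library (assumed include path)

using json = nlohmann::ordered_json;

static std::string render_cli_prompt(const common_chat_templates * tmpls) {
    // an OAI-compatible message list, as the CLI would accumulate it turn by turn
    json messages = json::array({
        { { "role", "system" }, { "content", "You are a helpful assistant." } },
        { { "role", "user"   }, { "content", "Hello!" } },
    });

    common_chat_templates_inputs inputs;
    inputs.messages              = common_chat_msgs_parse_oaicompat(messages);
    inputs.add_generation_prompt = true;

    // same call as in tokenize_cli_input above; the returned prompt is what gets tokenized
    auto chat_params = common_chat_templates_apply(tmpls, inputs);
    return chat_params.prompt;
}
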
@@ -1481,6 +1519,10 @@ struct server_context_impl {
            case SERVER_TASK_TYPE_EMBEDDING:
            case SERVER_TASK_TYPE_RERANK:
                {
                    if (!tokenize_cli_input(task)) {
                        break;
                    }

                    const int id_slot = task.id_slot;
                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
@@ -1690,7 +1732,6 @@ struct server_context_impl {
                    res->id = task.id;
                    queue_results.send(std::move(res));
                } break;
        }
    }
@@ -2626,6 +2667,15 @@ server_response_reader server_context::get_response_reader() {
    return impl->get_response_reader();
}

server_context_info server_context::get_info() const {
    return server_context_info {
        /* build_info    */ build_info,
        /* model_name    */ impl->model_name,
        /* has_inp_image */ impl->oai_parser_opt.allow_image,
        /* has_inp_audio */ impl->oai_parser_opt.allow_audio,
    };
}

// generator-like API for HTTP response generation
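
The new get_info() accessor is what lets the CLI front-end print build/model details and the supported input modalities at startup (the "add display info" item in the commit message). A minimal consumer could look like the sketch below; the printing code is an assumption, as is treating build_info/model_name as std::string and the has_* fields as bool; only the struct and field names come from the diff above.

// minimal sketch of a consumer of server_context::get_info(); illustrative only
#include <cstdio>

static void print_display_info(const server_context & ctx) {
    const server_context_info info = ctx.get_info();

    printf("build : %s\n", info.build_info.c_str());
    printf("model : %s\n", info.model_name.c_str());
    printf("image input: %s\n", info.has_inp_image ? "supported" : "not supported");
    printf("audio input: %s\n", info.has_inp_audio ? "supported" : "not supported");
}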