cli: new CLI experience (#17824)
* wip
* wip
* fix logging, add display info
* handle commands
* add args
* wip
* move old cli to llama-completion
* rm deprecation notice
* move server to a shared library
* move ci to llama-completion
* add loading animation
* add --show-timings arg
* add /read command, improve LOG_ERR
* add args for speculative decoding, enable show timings by default
* add arg --image and --audio
* fix windows build
* support reasoning_content
* fix llama2c workflow
* color default is auto
* fix merge conflicts
* properly fix color problem
  Co-authored-by: bandoti <bandoti@users.noreply.github.com>
* better loading spinner
* make sure to clean color on force-exit
* also clear input files on "/clear"
* simplify common_log_flush
* add warning in mtmd-cli
* implement console writer
* fix data race
* add attribute
* fix llama-completion and mtmd-cli
* add some notes about console::log
* fix compilation

---------

Co-authored-by: bandoti <bandoti@users.noreply.github.com>
@@ -1474,6 +1474,44 @@ struct server_context_impl {

    // Functions to process the task
    //

    // tokenize the input if it's set by CLI, return false on error
    bool tokenize_cli_input(server_task & task) {
        if (task.cli_input == nullptr) {
            return true; // nothing to do
        }
        try {
            auto & opt = oai_parser_opt;
            common_chat_templates_inputs inputs;
            inputs.messages              = common_chat_msgs_parse_oaicompat(task.cli_input);
            inputs.tools                 = {}; // TODO
            inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
            inputs.json_schema           = ""; // TODO
            inputs.grammar               = ""; // TODO
            inputs.use_jinja             = opt.use_jinja;
            inputs.parallel_tool_calls   = false;
            inputs.add_generation_prompt = true;
            inputs.reasoning_format      = opt.reasoning_format;
            inputs.enable_thinking       = opt.enable_thinking;

            // Apply chat template to the list of messages
            auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);

            // tokenize the resulting prompt
            auto & prompt = chat_params.prompt;
            if (mctx != nullptr) {
                task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
            } else {
                task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
            }
            task.cli_input.clear();
            task.cli_files.clear();
        } catch (const std::exception & e) {
            send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
            return false;
        }
        return true;
    }

    void process_single_task(server_task && task) {
        switch (task.type) {
            case SERVER_TASK_TYPE_COMPLETION:
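For context, `common_chat_msgs_parse_oaicompat()` consumes the same OpenAI-style `messages` array used by the HTTP chat endpoint, and the `nullptr` comparison plus `.clear()` calls above suggest `task.cli_input` holds a JSON value. Below is a minimal, standalone sketch (not part of this patch) of the payload shape the new CLI front end would hand over; the `main()` scaffolding and the message contents are illustrative assumptions.

```cpp
// Standalone illustration only: the OpenAI-style "messages" shape that
// common_chat_msgs_parse_oaicompat() consumes. Everything except the message
// schema itself is an assumption made for the sake of the example.
#include <nlohmann/json.hpp>
#include <iostream>

int main() {
    nlohmann::ordered_json cli_input = nlohmann::ordered_json::array();

    // a system turn followed by a user turn, as an interactive chat CLI would build them
    cli_input.push_back({ {"role", "system"}, {"content", "You are a helpful assistant."} });
    cli_input.push_back({ {"role", "user"},   {"content", "Summarize the file I just loaded."} });

    // in tokenize_cli_input() this array is parsed into chat messages,
    // rendered through the model's chat template, and then tokenized
    std::cout << cli_input.dump(2) << std::endl;
    return 0;
}
```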
@@ -1481,6 +1519,10 @@ struct server_context_impl {
            case SERVER_TASK_TYPE_EMBEDDING:
            case SERVER_TASK_TYPE_RERANK:
                {
                    if (!tokenize_cli_input(task)) {
                        break;
                    }

                    const int id_slot = task.id_slot;

                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
@@ -1690,7 +1732,6 @@ struct server_context_impl {
                    res->id = task.id;
                    queue_results.send(std::move(res));
                } break;

        }
    }
@@ -2626,6 +2667,15 @@ server_response_reader server_context::get_response_reader() {
    return impl->get_response_reader();
}

server_context_info server_context::get_info() const {
    return server_context_info {
        /* build_info    */ build_info,
        /* model_name    */ impl->model_name,
        /* has_inp_image */ impl->oai_parser_opt.allow_image,
        /* has_inp_audio */ impl->oai_parser_opt.allow_audio,
    };
}



// generator-like API for HTTP response generation
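The new `get_info()` accessor exposes just enough metadata for a front end to tailor its behavior. The following is a hypothetical caller-side sketch, not taken from this patch: it assumes the server header is included and that the string fields stream to `std::ostream`; only `server_context_info` and its fields come from the hunk above.

```cpp
// Hypothetical usage sketch: consult server_context::get_info() to warn the
// user when --image / --audio input cannot be accepted by the loaded model.
// The surrounding function and output wording are assumptions.
#include <iostream>

static void print_startup_banner(const server_context & ctx) {
    const server_context_info info = ctx.get_info();

    std::cerr << info.build_info << "\n";
    std::cerr << "model: " << info.model_name << "\n";

    if (!info.has_inp_image) {
        std::cerr << "note: the loaded model does not accept --image input\n";
    }
    if (!info.has_inp_audio) {
        std::cerr << "note: the loaded model does not accept --audio input\n";
    }
}
```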