server: add presets (config) when using multiple models (#17859)
* llama-server: recursive GGUF loading Replace flat directory scan with recursive traversal using std::filesystem::recursive_directory_iterator. Support for nested vendor/model layouts (e.g. vendor/model/*.gguf). Model name now reflects the relative path within --models-dir instead of just the filename. Aggregate files by parent directory via std::map before constructing local_model * server : router config POC (INI-based per-model settings) * server: address review feedback from @aldehir and @ngxson PEG parser usage improvements: - Simplify parser instantiation (remove arena indirection) - Optimize grammar usage (ws instead of zero_or_more, remove optional wrapping) - Fix last line without newline bug (+ operator instead of <<) - Remove redundant end position check Feature scope: - Remove auto-reload feature (will be separate PR per @ngxson) - Keep config.ini auto-creation and template generation - Preserve per-model customization logic Co-authored-by: aldehir <aldehir@users.noreply.github.com> Co-authored-by: ngxson <ngxson@users.noreply.github.com> * server: adopt aldehir's line-oriented PEG parser Complete rewrite of INI parser grammar and visitor: - Use p.chars(), p.negate(), p.any() instead of p.until() - Support end-of-line comments (key=value # comment) - Handle EOF without trailing newline correctly - Strict identifier validation ([a-zA-Z_][a-zA-Z0-9_.-]*) - Simplified visitor (no pending state, no trim needed) - Grammar handles whitespace natively via eol rule Business validation preserved: - Reject section names starting with LLAMA_ARG_* - Accept only keys starting with LLAMA_ARG_* - Require explicit section before key-value pairs Co-authored-by: aldehir <aldehir@users.noreply.github.com> * server: fix CLI/env duplication in child processes Children now receive minimal CLI args (executable, model, port, alias) instead of inheriting all router args. Global settings pass through LLAMA_ARG_* environment variables only, eliminating duplicate config warnings. Fixes: Router args like -ngl, -fa were passed both via CLI and env, causing 'will be overwritten' warnings on every child spawn * add common/preset.cpp * fix compile * cont * allow custom-path models * add falsey check * server: fix router model discovery and child process spawning - Sanitize model names: replace / and \ with _ for display - Recursive directory scan with relative path storage - Convert relative paths to absolute when spawning children - Filter router control args from child processes - Refresh args after port assignment for correct port value - Fallback preset lookup for compatibility - Fix missing argv[0]: store server binary path before base_args parsing * Revert "server: fix router model discovery and child process spawning" This reverts commit e3832b42eeea7fcb108995966c7584479f745857. * clarify about "no-" prefix * correct render_args() to include binary path * also remove arg LLAMA_ARG_MODELS_PRESET for child * add co-author for ini parser code Co-authored-by: aldehir <hello@alde.dev> * also set LLAMA_ARG_HOST * add CHILD_ADDR * Remove dead code --------- Co-authored-by: aldehir <aldehir@users.noreply.github.com> Co-authored-by: ngxson <ngxson@users.noreply.github.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co> Co-authored-by: aldehir <hello@alde.dev>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
#include "preset.h"
|
||||
#include "server-http.h"
|
||||
|
||||
#include <mutex>
|
||||
@@ -47,6 +48,7 @@ static std::string server_model_status_to_string(server_model_status status) {
|
||||
}
|
||||
|
||||
struct server_model_meta {
|
||||
common_preset preset;
|
||||
std::string name;
|
||||
std::string path;
|
||||
std::string path_mmproj; // only available if in_cache=false
|
||||
@@ -54,7 +56,7 @@ struct server_model_meta {
|
||||
int port = 0;
|
||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||
int64_t last_used = 0; // for LRU unloading
|
||||
std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
|
||||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
|
||||
bool is_active() const {
|
||||
@@ -66,6 +68,19 @@ struct server_model_meta {
|
||||
}
|
||||
};
|
||||
|
||||
// the server_presets struct holds the presets read from presets.ini
|
||||
// as well as base args from the router server
|
||||
struct server_presets {
|
||||
common_presets presets;
|
||||
common_params_context ctx_params;
|
||||
std::map<common_arg, std::string> base_args;
|
||||
std::map<std::string, common_arg> control_args; // args reserved for server control
|
||||
|
||||
server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir);
|
||||
common_preset get_preset(const std::string & name);
|
||||
void render_args(server_model_meta & meta);
|
||||
};
|
||||
|
||||
struct subprocess_s;
|
||||
|
||||
struct server_models {
|
||||
@@ -85,14 +100,21 @@ private:
|
||||
std::vector<std::string> base_args;
|
||||
std::vector<std::string> base_env;
|
||||
|
||||
server_presets presets;
|
||||
|
||||
void update_meta(const std::string & name, const server_model_meta & meta);
|
||||
|
||||
// unload least recently used models if the limit is reached
|
||||
void unload_lru();
|
||||
|
||||
// not thread-safe, caller must hold mutex
|
||||
void add_model(server_model_meta && meta);
|
||||
|
||||
public:
|
||||
server_models(const common_params & params, int argc, char ** argv, char ** envp);
|
||||
|
||||
void load_models();
|
||||
|
||||
// check if a model instance exists
|
||||
bool has_model(const std::string & name);
|
||||
|
||||
@@ -102,8 +124,7 @@ public:
|
||||
// return a copy of all model metadata
|
||||
std::vector<server_model_meta> get_all_meta();
|
||||
|
||||
// if auto_load is true, load the model with previous args if any
|
||||
void load(const std::string & name, bool auto_load);
|
||||
void load(const std::string & name);
|
||||
void unload(const std::string & name);
|
||||
void unload_all();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user