server: support load model on startup, support preset-only options (#18206)

* server: support autoload model, support preset-only options * add docs * load-on-startup * fix * Update common/arg.cpp Co-authored-by: Pascal <admin@serveurperso.com> --------- Co-authored-by: Pascal <admin@serveurperso.com>
2025-12-20 09:25:27 +01:00
parent 74e05131e9
commit 9e39a1e6a9
7 changed files with 80 additions and 10 deletions
@@ -1480,6 +1480,9 @@ The precedence rule for preset options is as follows:
 2. **Model-specific options** defined in the preset file (e.g. `[ggml-org/MY-MODEL...]`)
 3. **Global options** defined in the preset file (`[*]`)

+The following options are exclusive to presets (they are not treated as command-line arguments):
+- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
+
 ### Routing requests

 Requests are routed according to the requested model name.
@@ -226,6 +226,26 @@ void server_models::load_models() {
            SRV_INF("  %c %s\n", has_custom ? '*' : ' ', name.c_str());
        }
    }
+
+    // load models whose preset specifies the load-on-startup option
+    std::vector<std::string> models_to_load;
+    for (const auto & [name, inst] : mapping) {
+        std::string val;
+        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) {
+            models_to_load.push_back(name);
+        }
+    }
+    if ((int)models_to_load.size() > base_params.models_max) {
+        throw std::runtime_error(string_format(
+            "number of models to load on startup (%zu) exceeds models_max (%d)",
+            models_to_load.size(),
+            base_params.models_max
+        ));
+    }
+    for (const auto & name : models_to_load) {
+        SRV_INF("(startup) loading model %s\n", name.c_str());
+        load(name);
+    }
 }

 void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
@@ -103,27 +103,29 @@ public:

    void load_models();

-    // check if a model instance exists
+    // check if a model instance exists (thread-safe)
    bool has_model(const std::string & name);

-    // return a copy of model metadata
+    // return a copy of model metadata (thread-safe)
    std::optional<server_model_meta> get_meta(const std::string & name);

-    // return a copy of all model metadata
+    // return a copy of all model metadata (thread-safe)
    std::vector<server_model_meta> get_all_meta();

+    // load and unload model instances
+    // these functions are thread-safe
    void load(const std::string & name);
    void unload(const std::string & name);
    void unload_all();

-    // update the status of a model instance
+    // update the status of a model instance (thread-safe)
    void update_status(const std::string & name, server_model_status status);

-    // wait until the model instance is fully loaded
+    // wait until the model instance is fully loaded (thread-safe)
    // return when the model is loaded or failed to load
    void wait_until_loaded(const std::string & name);

-    // load the model if not loaded, otherwise do nothing
+    // load the model if not loaded, otherwise do nothing (thread-safe)
    // return false if model is already loaded; return true otherwise (meta may need to be refreshed)
    bool ensure_model_loaded(const std::string & name);