server: rename --clear-idle to --cache-idle-slots (#21741)

2026-04-20 08:30:24 +03:00
parent e365e658f0
commit 9d49acb2a7
6 changed files with 16 additions and 16 deletions
@@ -167,7 +167,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
-| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
+| `--cache-idle-slots, --no-cache-idle-slots` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CACHE_IDLE_SLOTS) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
 | `-sp, --special` | special tokens output enabled (default: false) |
@@ -987,13 +987,13 @@ private:

        metrics.init();

-        if (params_base.clear_idle) {
+        if (params_base.cache_idle_slots) {
            if (!params_base.kv_unified) {
-                SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
-                params_base.clear_idle = false;
+                SRV_WRN("%s: --cache-idle-slots requires --kv-unified, disabling\n", __func__);
+                params_base.cache_idle_slots = false;
            } else if (params_base.cache_ram_mib == 0) {
-                SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
-                params_base.clear_idle = false;
+                SRV_WRN("%s: --cache-idle-slots requires --cache-ram, disabling\n", __func__);
+                params_base.cache_idle_slots = false;
            } else {
                SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
                SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
@@ -1886,7 +1886,7 @@ private:
                        break; // drop the task
                    }

-                    if (params_base.clear_idle) {
+                    if (params_base.cache_idle_slots) {
                        for (auto & s : slots) {
                            if (!s.is_processing()) {
                                slot_save_and_clear(s);
@@ -91,7 +91,7 @@ def test_clear_and_restore():

 def test_disabled_with_flag():
    global server
-    server.no_clear_idle = True
+    server.no_cache_idle_slots = True
    server.start()
    log = LogReader(server.log_path)

@@ -103,7 +103,7 @@ class ServerProcess:
    media_path: str | None = None
    sleep_idle_seconds: int | None = None
    cache_ram: int | None = None
-    no_clear_idle: bool = False
+    no_cache_idle_slots: bool = False
    log_path: str | None = None
    webui_mcp_proxy: bool = False

@@ -242,8 +242,8 @@ class ServerProcess:
            server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
        if self.cache_ram is not None:
            server_args.extend(["--cache-ram", self.cache_ram])
-        if self.no_clear_idle:
-            server_args.append("--no-clear-idle")
+        if self.no_cache_idle_slots:
+            server_args.append("--no-cache-idle-slots")
        if self.webui_mcp_proxy:
            server_args.append("--webui-mcp-proxy")