* llama-server: add router multi-model tests (#17704) Add 4 test cases for model router: - test_router_unload_model: explicit model unloading - test_router_models_max_evicts_lru: LRU eviction with --models-max - test_router_no_models_autoload: --no-models-autoload flag behavior - test_router_api_key_required: API key authentication Tests use async model loading with polling and graceful skip when insufficient models available for eviction testing. utils.py changes: - Add models_max, models_dir, no_models_autoload attributes to ServerProcess - Handle JSONDecodeError for non-JSON error responses (fallback to text) * llama-server: update test models to new HF repos * add offline * llama-server: fix router LRU eviction test and add preloading Fix eviction test: load 2 models first, verify state, then load 3rd to trigger eviction. Previous logic loaded all 3 at once, causing first model to be evicted before verification could occur. Add module fixture to preload models via ServerPreset.load_all() and mark test presets as offline to use cached models * llama-server: fix split model download on Windows --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import subprocess
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
from json import JSONDecodeError
|
||||
import sys
|
||||
import requests
|
||||
import time
|
||||
@@ -83,6 +84,9 @@ class ServerProcess:
|
||||
pooling: str | None = None
|
||||
draft: int | None = None
|
||||
api_key: str | None = None
|
||||
models_dir: str | None = None
|
||||
models_max: int | None = None
|
||||
no_models_autoload: bool | None = None
|
||||
lora_files: List[str] | None = None
|
||||
enable_ctx_shift: int | None = False
|
||||
draft_min: int | None = None
|
||||
@@ -143,6 +147,10 @@ class ServerProcess:
|
||||
server_args.extend(["--hf-repo", self.model_hf_repo])
|
||||
if self.model_hf_file:
|
||||
server_args.extend(["--hf-file", self.model_hf_file])
|
||||
if self.models_dir:
|
||||
server_args.extend(["--models-dir", self.models_dir])
|
||||
if self.models_max is not None:
|
||||
server_args.extend(["--models-max", self.models_max])
|
||||
if self.n_batch:
|
||||
server_args.extend(["--batch-size", self.n_batch])
|
||||
if self.n_ubatch:
|
||||
@@ -204,6 +212,8 @@ class ServerProcess:
|
||||
server_args.extend(["--draft-min", self.draft_min])
|
||||
if self.no_webui:
|
||||
server_args.append("--no-webui")
|
||||
if self.no_models_autoload:
|
||||
server_args.append("--no-models-autoload")
|
||||
if self.jinja:
|
||||
server_args.append("--jinja")
|
||||
else:
|
||||
@@ -295,7 +305,13 @@ class ServerProcess:
|
||||
result = ServerResponse()
|
||||
result.headers = dict(response.headers)
|
||||
result.status_code = response.status_code
|
||||
result.body = response.json() if parse_body else None
|
||||
if parse_body:
|
||||
try:
|
||||
result.body = response.json()
|
||||
except JSONDecodeError:
|
||||
result.body = response.text
|
||||
else:
|
||||
result.body = None
|
||||
print("Response from server", json.dumps(result.body, indent=2))
|
||||
return result
|
||||
|
||||
@@ -434,8 +450,9 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def tinyllama2() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/stories260K.gguf"
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/test-model-stories260K"
|
||||
server.model_hf_file = None
|
||||
server.model_alias = "tinyllama-2"
|
||||
server.n_ctx = 512
|
||||
server.n_batch = 32
|
||||
@@ -479,8 +496,8 @@ class ServerPreset:
|
||||
def tinyllama_infill() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
server.model_hf_repo = "ggml-org/models"
|
||||
server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
|
||||
server.model_hf_repo = "ggml-org/test-model-stories260K-infill"
|
||||
server.model_hf_file = None
|
||||
server.model_alias = "tinyllama-infill"
|
||||
server.n_ctx = 2048
|
||||
server.n_batch = 1024
|
||||
@@ -537,6 +554,7 @@ class ServerPreset:
|
||||
@staticmethod
|
||||
def router() -> ServerProcess:
|
||||
server = ServerProcess()
|
||||
server.offline = True # will be downloaded by load_all()
|
||||
# router server has no models
|
||||
server.model_file = None
|
||||
server.model_alias = None
|
||||
|
||||
Reference in New Issue
Block a user