server: add auto-sleep after N seconds of idle (#18228)
* implement sleeping at queue level
* implement server-context suspend
* add test
* add docs
* optimization: add fast path
* make sure to free llama_init
* nits
* fix use-after-free
* allow /models to be accessed during sleeping, fix use-after-free
* don't allow accessing /models during sleep, it is not thread-safe
* fix data race on accessing props and model_meta
* small clean up
* trailing whitespace
* rm outdated comments
This commit is contained in:
@@ -0,0 +1,39 @@
|
||||
import pytest
|
||||
import time
|
||||
from utils import *
|
||||
|
||||
server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def create_server():
    """Rebind the module-level ``server`` to a fresh tinyllama2 preset.

    Declared ``autouse`` so pytest runs it for every test in this module
    without the test having to request it explicitly.
    """
    global server
    server = ServerPreset.tinyllama2()
|
||||
|
||||
|
||||
def test_server_sleep():
    """Check that the server reports sleeping when idle and wakes on a request.

    The server is started with ``sleep_idle_seconds = 1``. After waiting
    longer than that, ``/health`` and ``/props`` must still answer with 200
    and ``/props`` must report ``is_sleeping`` as true. A ``/completion``
    request is then sent, after which ``/props`` must report ``is_sleeping``
    as false.
    """
    global server
    server.sleep_idle_seconds = 1
    server.start()

    # wait a bit so that server can go to sleep
    time.sleep(2)

    # make sure these endpoints are still responsive after sleep
    res = server.make_request("GET", "/health")
    assert res.status_code == 200
    res = server.make_request("GET", "/props")
    assert res.status_code == 200
    # identity check: the flag must be a real boolean, not merely truthy
    assert res.body["is_sleeping"] is True

    # make a generation request to wake up the server
    res = server.make_request("POST", "/completion", data={
        "n_predict": 1,
        "prompt": "Hello",
    })
    assert res.status_code == 200

    # it should no longer be sleeping
    res = server.make_request("GET", "/props")
    assert res.status_code == 200
    assert res.body["is_sleeping"] is False
|
||||
@@ -100,6 +100,7 @@ class ServerProcess:
|
||||
server_path: str | None = None
|
||||
mmproj_url: str | None = None
|
||||
media_path: str | None = None
|
||||
sleep_idle_seconds: int | None = None
|
||||
|
||||
# session variables
|
||||
process: subprocess.Popen | None = None
|
||||
@@ -230,6 +231,8 @@ class ServerProcess:
|
||||
server_args.extend(["--mmproj-url", self.mmproj_url])
|
||||
if self.media_path:
|
||||
server_args.extend(["--media-path", self.media_path])
|
||||
if self.sleep_idle_seconds is not None:
|
||||
server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
|
||||
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"tests: starting server with: {' '.join(args)}")
|
||||
|
||||
Reference in New Issue
Block a user