server: add auto-sleep after N seconds of idle (#18228)
* implement sleeping at queue level * implement server-context suspend * add test * add docs * optimization: add fast path * make sure to free llama_init * nits * fix use-after-free * allow /models to be accessed during sleeping, fix use-after-free * don't allow accessing /models during sleep, it is not thread-safe * fix data race on accessing props and model_meta * small clean up * trailing whitespace * rm outdated comments
This commit is contained in:
@@ -22,9 +22,6 @@ struct server_context {
|
||||
server_context();
|
||||
~server_context();
|
||||
|
||||
// initialize slots and server-related data
|
||||
void init();
|
||||
|
||||
// load the model and initialize llama_context
|
||||
// returns true on success
|
||||
bool load_model(const common_params & params);
|
||||
@@ -35,7 +32,7 @@ struct server_context {
|
||||
// terminate main loop (will unblock start_loop)
|
||||
void terminate();
|
||||
|
||||
// get the underlaying llama_context
|
||||
// get the underlaying llama_context, can return nullptr if sleeping
|
||||
llama_context * get_llama_context() const;
|
||||
|
||||
// get a new response reader, used by CLI application
|
||||
|
||||
Reference in New Issue
Block a user