server: add auto-sleep after N seconds of idle (#18228)
* implement sleeping at queue level
* implement server-context suspend
* add test
* add docs
* optimization: add fast path
* make sure to free llama_init
* nits
* fix use-after-free
* allow /models to be accessed during sleeping, fix use-after-free
* don't allow accessing /models during sleep, it is not thread-safe
* fix data race on accessing props and model_meta
* small clean up
* trailing whitespace
* rm outdated comments
This commit is contained in:
@@ -12,7 +12,10 @@
|
||||
struct server_queue {
|
||||
private:
|
||||
int id = 0;
|
||||
bool running;
|
||||
bool running = false;
|
||||
bool sleeping = false;
|
||||
bool req_stop_sleeping = false;
|
||||
int64_t time_last_task = 0;
|
||||
|
||||
// queues
|
||||
std::deque<server_task> queue_tasks;
|
||||
@@ -24,6 +27,7 @@ private:
|
||||
// callback functions
|
||||
std::function<void(server_task &&)> callback_new_task;
|
||||
std::function<void(void)> callback_update_slots;
|
||||
std::function<void(bool)> callback_sleeping_state;
|
||||
|
||||
public:
|
||||
// Add a new task to the end of the queue
|
||||
@@ -38,15 +42,18 @@ public:
|
||||
// Get the next id for creating a new task
|
||||
int get_new_id();
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(server_task &&)> callback);
|
||||
|
||||
// Register the function to be called when all slots data is ready to be processed
|
||||
void on_update_slots(std::function<void(void)> callback);
|
||||
|
||||
// Call when the state of one slot is changed, it will move one task from deferred to main queue
|
||||
void pop_deferred_task();
|
||||
|
||||
// if sleeping, request to exit the sleeping state and wait until the exit has completed
|
||||
// returns immediately if not sleeping
|
||||
void wait_until_no_sleep();
|
||||
|
||||
bool is_sleeping() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
return sleeping;
|
||||
}
|
||||
|
||||
// end the start_loop routine
|
||||
void terminate();
|
||||
|
||||
@@ -56,8 +63,15 @@ public:
|
||||
* - Process the task (i.e. maybe copy data into slot)
|
||||
* - Check if multitask is finished
|
||||
* - Update all slots
|
||||
*
|
||||
* Sleeping procedure (disabled if idle_sleep_ms < 0):
|
||||
* - If no task arrives within idle_sleep_ms, enter the sleeping state
|
||||
* - Call callback_sleeping_state(true)
|
||||
* - Wait until req_stop_sleeping is set to true
|
||||
* - Call callback_sleeping_state(false)
|
||||
* - Exit sleeping state
|
||||
*/
|
||||
void start_loop();
|
||||
void start_loop(int64_t idle_sleep_ms = -1);
|
||||
|
||||
// for metrics
|
||||
size_t queue_tasks_deferred_size() {
|
||||
@@ -65,6 +79,27 @@ public:
|
||||
return queue_tasks_deferred.size();
|
||||
}
|
||||
|
||||
//
|
||||
// Functions below are not thread-safe; they must only be called before start_loop() is called
|
||||
//
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(server_task &&)> callback) {
|
||||
callback_new_task = std::move(callback);
|
||||
}
|
||||
|
||||
// Register the function to be called when all slots data is ready to be processed
|
||||
void on_update_slots(std::function<void(void)> callback) {
|
||||
callback_update_slots = std::move(callback);
|
||||
}
|
||||
|
||||
// Register callback for sleeping state change
|
||||
// note: when entering sleeping state, the callback is called AFTER sleeping is set to true
|
||||
// when leaving sleeping state, the callback is called BEFORE sleeping is set to false
|
||||
void on_sleeping_state(std::function<void(bool)> callback) {
|
||||
callback_sleeping_state = std::move(callback);
|
||||
}
|
||||
|
||||
private:
|
||||
void cleanup_pending_task(int id_target);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user