server: improve slots scheduling for n_cmpl (#18789)

* server : make sure child tasks are scheduled to launch together with their parent

* fix

* add comment pointing to this PR

* fix

* clean up

* more debug messages

* add pop_deferred_task with specific ID version

* improve the logic

* simple approach

* no double move

* correct return type of launch_slots_with_parent_task
This commit is contained in:
Xuan-Son Nguyen
2026-01-15 17:10:28 +01:00
committed by GitHub
parent 39173bcacb
commit a04c2b06a3
5 changed files with 194 additions and 103 deletions
+23 -3
View File
@@ -121,8 +121,10 @@ struct server_task {
int id_slot = -1;
// used by parallel sampling (multiple completions from same prompt)
int n_children = 0; // number of tasks reusing this prompt
int id_parent = -1;
// temporary store of child tasks for scheduling
// note: accessing the elements is invalid after the task is moved to server_slot
std::vector<server_task> child_tasks;
// used by SERVER_TASK_TYPE_INFERENCE
task_params params;
@@ -197,11 +199,14 @@ struct server_task {
std::unordered_set<int> ids(tasks.size());
for (size_t i = 0; i < tasks.size(); i++) {
ids.insert(tasks[i].id);
for (auto & child : tasks[i].child_tasks) {
ids.insert(child.id);
}
}
return ids;
}
server_task create_child(int id_parent, int id_child) const {
void add_child(int id_parent, int id_child) {
server_task copy;
copy.id = id_child;
@@ -209,8 +214,15 @@ struct server_task {
copy.params = params;
copy.type = type;
copy.tokens = tokens.clone();
copy.id_slot = -1; // child tasks cannot specify slot
return copy;
// use a different sampling seed for each child
// note: https://github.com/ggml-org/llama.cpp/pull/18700#discussion_r2675115723
if (copy.params.sampling.seed != LLAMA_DEFAULT_SEED) {
copy.params.sampling.seed += (uint32_t)child_tasks.size() + 1;
}
child_tasks.push_back(std::move(copy));
}
// the task will be moved into queue, then onto slots
@@ -218,6 +230,14 @@ struct server_task {
// Builds the result-state object for this task, constructed from the
// task's params.oaicompat_chat_syntax setting. Does not modify the task.
task_result_state create_state() const {
return task_result_state(params.oaicompat_chat_syntax);
}
// True when at least one child task is currently stored in child_tasks.
bool is_parent() const {
    return !child_tasks.empty();
}
bool is_child() const {
return id_parent != -1;
}
};
struct result_timings {