server: support multiple generations from one prompt (OAI "n" option) (#17775)

* backend support

* server: support multiple generations from one prompt (OAI "n" option)

* fix invalid batch

* format oai

* clean up

* disable ctx shift

* add test

* update comments

* fix style

* add n_cmpl to docs [no ci]

* allow using both `n_cmpl` and `n`
This commit is contained in:
Xuan-Son Nguyen
2025-12-06 15:54:38 +01:00
committed by GitHub
parent 09c7c50e64
commit c42712b056
7 changed files with 146 additions and 19 deletions
+24
View File
@@ -53,6 +53,7 @@ struct task_params {
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // minimum line indentation for the generated text in number of whitespace characters
int32_t n_cmpl = 1; // number of completions to generate from this prompt
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
@@ -89,6 +90,10 @@ struct server_task {
int id_target = -1;
int id_slot = -1;
// used by parallel sampling (multiple completions from same prompt)
size_t n_children = 0; // number of tasks reusing this prompt
int id_parent = -1;
// used by SERVER_TASK_TYPE_INFERENCE
task_params params;
server_tokens tokens;
@@ -130,6 +135,17 @@ struct server_task {
}
return ids;
}
// Build a child task that re-uses this task's prompt for an additional
// completion (parallel sampling). The child gets its own ids/index but
// shares the parent's parameters, type, and a cloned copy of the tokens.
server_task create_child(int id_parent, int id_child, int idx) const {
    server_task child;
    child.id        = id_child;
    child.index     = idx;
    child.id_parent = id_parent;
    child.type      = type;
    child.params    = params;
    child.tokens    = tokens.clone();
    return child;
}
};
struct result_timings {
@@ -466,6 +482,14 @@ struct server_prompt {
// Number of tokens currently stored in this prompt.
int n_tokens() const {
    const auto count = tokens.size();
    return count;
}
// Produce an independent copy of this prompt; the token storage is
// duplicated via tokens.clone(), while data and checkpoints are copied.
server_prompt clone() const {
    server_prompt copy = {
        tokens.clone(),
        data,
        checkpoints
    };
    return copy;
}
};
struct server_prompt_cache {