server : add thinking content blocks to Anthropic Messages API (#18551)

* server : add thinking content blocks to Anthropic Messages API

  Add support for returning reasoning/thinking content in Anthropic API responses when using models with --reasoning-format deepseek and the thinking parameter enabled.

  - Non-streaming: adds thinking block before text in content array
  - Streaming: emits thinking_delta events with correct block indices
  - Partial streaming: tracks reasoning state across chunks via anthropic_has_reasoning member variable

  Tested with bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF model.

* server : fix Anthropic API streaming for thinking content blocks

  Add signature field and fix duplicate content_block_start events in Anthropic Messages API streaming responses for reasoning models.

* server: refactor Anthropic streaming state to avoid raw pointer

  Replace raw pointer to task_result_state with direct field copies:

  - Copy state fields in update() before processing chunk
  - Use local copies in to_json_anthropic() instead of dereferencing
  - Pre-compute state updates for next chunk in update()

  This makes the data flow clearer and avoids unsafe pointer patterns.
2026-01-06 16:17:13 +01:00
parent bd2a93d475
commit 3d26a09dc7
3 changed files with 246 additions and 18 deletions
@@ -96,6 +96,10 @@ struct task_result_state {
    std::string generated_text; // append new chunks of generated text here
    std::vector<std::string> generated_tool_call_ids;

+    // for Anthropic API streaming: track content block state across chunks
+    bool anthropic_thinking_block_started = false;
+    bool anthropic_text_block_started = false;
+
    // Construct the per-task state from the chat syntax to use.
    // Only the oaicompat_chat_syntax member is set here; the constructor body is empty.
    task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
        : oaicompat_chat_syntax(oaicompat_chat_syntax) {}

@@ -337,6 +341,12 @@ struct server_task_result_cmpl_partial : server_task_result {
    std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
    bool is_updated = false;

+    // for Anthropic API: track if any reasoning content has been generated
+    bool anthropic_has_reasoning = false;
+    // Streaming state copied from task_result_state for this chunk
+    bool anthropic_thinking_block_started = false;
+    bool anthropic_text_block_started = false;
+
    // A partial (streaming) result always reports that it is not a stop result.
    virtual bool is_stop() override {
        return false; // in stream mode, partial responses are not considered stop
    }
@@ -346,6 +356,22 @@ struct server_task_result_cmpl_partial : server_task_result {
    // Fold this partial result into the per-task `state` and snapshot the Anthropic
    // streaming flags that apply to this chunk.
    //
    // Statement order matters: the block-started flags are copied out of `state`
    // BEFORE the loop below modifies them, so this object keeps the values left by
    // earlier chunks while `state` ends up holding the values for the following chunk.
    virtual void update(task_result_state & state) override {
        is_updated = true;
        // hands `content` and `oaicompat_msg_diffs` to the shared state; the diffs are
        // read by the loop below (presumably filled in by this call - confirm in update_chat_msg)
        state.update_chat_msg(content, true, oaicompat_msg_diffs);
+        // track if the accumulated message (state.chat_msg) has any reasoning content
+        anthropic_has_reasoning = !state.chat_msg.reasoning_content.empty();
+
+        // Copy current state for use in to_json_anthropic() (reflects state BEFORE this chunk)
+        anthropic_thinking_block_started = state.anthropic_thinking_block_started;
+        anthropic_text_block_started = state.anthropic_text_block_started;
+
+        // Pre-compute state updates based on diffs (for next chunk); this loop only ever sets the flags to true, it never clears them
+        for (const auto & diff : oaicompat_msg_diffs) {
+            if (!diff.reasoning_content_delta.empty() && !state.anthropic_thinking_block_started) {
+                state.anthropic_thinking_block_started = true;
+            }
+            if (!diff.content_delta.empty() && !state.anthropic_text_block_started) {
+                state.anthropic_text_block_started = true;
+            }
+        }
    }

    json to_json_non_oaicompat();