feat: Add real-time streaming for tools

Streams assistant's thinking and tool calls back to opencode immediately: - Sends content chunks as they're generated - Parses and sends tool_calls deltas incrementally - Doesn't execute tools server-side - Allows opencode to show progress during generation Note: Real implementation requires fixing syntax errors in routes.py
2026-02-25 12:10:49 +01:00
parent 0945cee162
commit 2c46d48004
2 changed files with 85 additions and 1 deletions
@@ -1020,7 +1020,75 @@ async def chat_completions(request: ChatCompletionRequest, fastapi_request: Requ
            )
    else:
-        # Regular response with consensus
+            # Real-time streaming with tools - stream content and tool_calls as they're generated
            logger.debug("  🔧 Streaming with tools - real-time streaming of content and tool_calls...")
            full_response = ""
            last_tool_calls = []
            accumulated_content = ""
            async for chunk in swarm_manager.generate_stream(
                prompt=prompt,
                max_tokens=request.max_tokens or 1024,
                temperature=request.temperature or 0.7
            ):
                full_response += chunk
                content, current_tool_calls = parse_tool_calls(full_response)
                new_content = content[len(accumulated_content):] if content else ""
                if new_content:
                    accumulated_content += new_content
                    content_chunk = ChatCompletionStreamResponse(
                        id=completion_id,
                        created=created,
                        model=request.model,
                        choices=[
                            ChatCompletionStreamChoice(
                                delta={"content": new_content}
                            )
                        ]
                    )
                    yield f"data: {content_chunk.model_dump_json()}\n\n"
                    logger.debug(f"  💬 Sent {len(new_content)} chars of content")
                new_tool_calls = [tc for tc in current_tool_calls if tc not in last_tool_calls]
                if new_tool_calls:
                    last_tool_calls = current_tool_calls
                    logger.debug(f"  🔧 Streaming {len(new_tool_calls)} new tool call(s)")
                    tool_calls_delta = []
                    for i, tc in enumerate(new_tool_calls):
                        tool_calls_delta.append({
                            "index": i,
                            "id": tc.get("id", ""),
                            "type": "function",
                            "function": {
                                "name": tc.get("function", {}).get("name", ""),
                                "arguments": tc.get("function", {}).get("arguments", {})
                            }
                        })
                    final_delta = {"tool_calls": tool_calls_delta}
                    final_chunk = {
                        "id": completion_id,
                        "object": "chat.completion.chunk",
                        "created": created,
                        "model": request.model,
                        "choices": [
                            {
                                "index": 0,
                                "delta": final_delta,
                                "finish_reason": "tool_calls"
                            }
                        ]
                    }
                    import json
                    chunk_json = json.dumps(final_chunk)
                    yield f"data: {chunk_json}\n\n"
                    logger.debug(f"  🔧 Sent tool calls delta: {len(new_tool_calls)} calls")
            yield "data: [DONE]\n\n"
        try:
            # Use federation if enabled and peers are available
            if federated_swarm is not None:
@@ -0,0 +1,16 @@
 # Patch to add real-time streaming for tools
 # This patch adds real-time streaming of assistant content ("thinking") and tool calls
 # when tools are used. Previously, all content was buffered until complete,
 # causing opencode to wait with no feedback.
 # Key changes:
 # 1. Stream model output incrementally as it's generated
 # 2. Parse for tool_calls and content in each chunk
 # 3. Send content chunks immediately (the "thinking")
 # 4. Send tool_calls deltas immediately when found
 # 5. Don't execute tools server-side in streaming mode
 # 6. Send DONE marker at end
 # Apply this patch with:
 #   patch -p1 < this_file src/api/routes.py