diff --git a/src/api/routes.py b/src/api/routes.py index 12fa0bd..0ccad15 100644 --- a/src/api/routes.py +++ b/src/api/routes.py @@ -1020,7 +1020,75 @@ async def chat_completions(request: ChatCompletionRequest, fastapi_request: Requ ) else: - # Regular response with consensus + # Real-time streaming with tools - stream content and tool_calls as they're generated + logger.debug(" 🔧 Streaming with tools - real-time streaming of content and tool_calls...") + + full_response = "" + last_tool_calls = [] + accumulated_content = "" + + async for chunk in swarm_manager.generate_stream( + prompt=prompt, + max_tokens=request.max_tokens or 1024, + temperature=request.temperature or 0.7 + ): + full_response += chunk + + content, current_tool_calls = parse_tool_calls(full_response) + + new_content = content[len(accumulated_content):] if content else "" + if new_content: + accumulated_content += new_content + content_chunk = ChatCompletionStreamResponse( + id=completion_id, + created=created, + model=request.model, + choices=[ + ChatCompletionStreamChoice( + delta={"content": new_content} + ) + ] + ) + yield f"data: {content_chunk.model_dump_json()}\n\n" + logger.debug(f" 💬 Sent {len(new_content)} chars of content") + + new_tool_calls = [tc for tc in current_tool_calls if tc not in last_tool_calls] + if new_tool_calls: + last_tool_calls = current_tool_calls + logger.debug(f" 🔧 Streaming {len(new_tool_calls)} new tool call(s)") + + tool_calls_delta = [] + for i, tc in enumerate(new_tool_calls): + tool_calls_delta.append({ + "index": i, + "id": tc.get("id", ""), + "type": "function", + "function": { + "name": tc.get("function", {}).get("name", ""), + "arguments": tc.get("function", {}).get("arguments", {}) + } + }) + + final_delta = {"tool_calls": tool_calls_delta} + final_chunk = { + "id": completion_id, + "object": "chat.completion.chunk", + "created": created, + "model": request.model, + "choices": [ + { + "index": 0, + "delta": final_delta, + "finish_reason": "tool_calls" + } 
+ ] } + import json + chunk_json = json.dumps(final_chunk) + yield f"data: {chunk_json}\n\n" + logger.debug(f" 🔧 Sent tool calls delta: {len(new_tool_calls)} calls") + + yield "data: [DONE]\n\n" try: # Use federation if enabled and peers are available if federated_swarm is not None: diff --git a/streaming_patch.diff b/streaming_patch.diff new file mode 100644 index 0000000..b78774f --- /dev/null +++ b/streaming_patch.diff @@ -0,0 +1,16 @@ +# Patch to add real-time streaming for tools + +# This patch adds real-time streaming of assistant content ("thinking") and tool calls +# when tools are used. Previously, all content was buffered until complete, +# causing opencode to wait with no feedback. + +# Key changes: +# 1. Stream model output incrementally as it's generated +# 2. Parse for tool_calls and content in each chunk +# 3. Send content chunks immediately (the "thinking") +# 4. Send tool_calls deltas immediately when found +# 5. Don't execute tools server-side in streaming mode +# 6. Send DONE marker at end + +# Apply this patch from the repository root with: +# patch -p1 < streaming_patch.diff