diff --git a/src/api/chat_handlers.py b/src/api/chat_handlers.py index b5814e5..a0bd51a 100644 --- a/src/api/chat_handlers.py +++ b/src/api/chat_handlers.py @@ -523,21 +523,14 @@ async def handle_chat_completion( logger.info(f" messages={len(request.messages)}") logger.info(f"{'='*60}") - # Use federation if available - if federated_swarm is not None: - peers = federated_swarm.discovery.get_peers() - if peers: - logger.info(f"🌐 Using federation with {len(peers)} peer(s)...") - content, tool_calls, finish_reason = await _generate_with_federation( - federated_swarm, prompt, request.max_tokens or 1024, request.temperature or 0.7 - ) - return _create_response(content, tool_calls, finish_reason, prompt, request, swarm_manager) - - - # Build conversation history messages = list(request.messages) + # Determine if we should use federation for generation + use_federation = federated_swarm is not None and len(federated_swarm.discovery.get_peers()) > 0 + if use_federation: + logger.info(f"🌐 Federation available with peers") + # Track thinking content for streaming (OpenCode reasoning_content) thinking_content: Optional[str] = None thinking_captured = False @@ -551,11 +544,22 @@ async def handle_chat_completion( iteration += 1 logger.info(f"--- Tool Execution Iteration {iteration} ---") - # Generate response + # Generate response (use federation if available) logger.debug(f"Generating response...") - response_text, tokens_generated, tps = await _generate_with_local_swarm( - swarm_manager, prompt, request.max_tokens or 1024, request.temperature or 0.7 - ) + if use_federation and iteration == 1: + # First iteration: use federation for consensus + logger.info(f"🌐 Using federation for generation...") + content, tool_calls, finish_reason = await _generate_with_federation( + federated_swarm, prompt, request.max_tokens or 1024, request.temperature or 0.7 + ) + response_text = content + tokens_generated = 0 # Will be calculated from usage if needed + tps = 0.0 + else: + # Subsequent iterations or no federation: use local swarm + response_text, tokens_generated, tps = await _generate_with_local_swarm( + swarm_manager, prompt, request.max_tokens or 1024, request.temperature or 0.7 + ) logger.info(f"Generated response ({len(response_text)} chars, {tokens_generated} tokens)") logger.debug(f"Response: {response_text[:200]}...")