"""End-to-end test for tool execution with a mock server.

This tests the complete flow:
1. Model generates tool call
2. Tools are executed
3. Response is generated based on tool results
"""
import asyncio
import os
import sys

import pytest

# Make the project's src/ tree importable when this file is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))


def make_tool_signature(tool_call):
    """Build a dedup signature for a tool call: "name:arguments", capped at 50 chars.

    Mirrors the loop-detection scheme the server uses: two calls with the
    same tool name and the same argument payload yield the same signature.
    """
    fn = tool_call["function"]
    return f"{fn['name']}:{fn['arguments']}"[:50]


@pytest.mark.asyncio
async def test_tool_flow():
    """Test the tool execution flow end-to-end."""
    # Import after path is set so api/ and tools/ resolve against src/.
    from api.models import ChatMessage, ChatCompletionRequest
    from api.tool_parser import parse_tool_calls
    from api.formatting import format_messages_with_tools
    from tools.executor import ToolExecutor

    print("=" * 60)
    print("End-to-End Tool Execution Test")
    print("=" * 60)

    # Test 1: Parse tool call from model response
    print("\n1. Testing tool parsing...")
    model_response = "TOOL: bash\nARGUMENTS: {\"command\": \"echo hello\"}"
    content, tool_calls = parse_tool_calls(model_response)
    assert tool_calls is not None, "Should parse tool call"
    assert len(tool_calls) == 1, "Should have one tool call"
    assert tool_calls[0]["function"]["name"] == "bash", "Should be bash tool"
    print(f"   ✓ Parsed tool: {tool_calls[0]['function']['name']}")

    # Test 2: Simulate tool result and format for next prompt
    print("\n2. Testing tool result formatting...")
    tool_result = "hello\n"
    # Build conversation history: user request, assistant tool call, tool output.
    messages = [
        ChatMessage(role="user", content="Run echo hello"),
        ChatMessage(role="assistant", content=model_response),
        ChatMessage(role="tool", content=tool_result),
    ]
    # Format for next generation
    next_prompt = format_messages_with_tools(messages, None)
    assert "tool" in next_prompt.lower(), "Prompt should include tool result"
    assert "hello" in next_prompt, "Prompt should include tool output"
    print(f"   ✓ Tool result formatted for next prompt")

    # Test 3: Verify loop detection
    print("\n3. Testing loop detection...")
    seen_tools = set()
    # First tool call
    tc1 = [{"function": {"name": "bash", "arguments": '{"command": "ls"}'}}]
    # FIX: derive the signature from the tool call itself. The original code
    # used a garbled hard-coded literal (the [:50] slice was inside the string
    # and never matched tc1's arguments), which made the duplicate check vacuous.
    sig1 = make_tool_signature(tc1[0])
    seen_tools.add(sig1)
    print(f"   ✓ First tool call tracked")
    # Duplicate tool call: an independent but identical payload must collide.
    tc2 = [{"function": {"name": "bash", "arguments": '{"command": "ls"}'}}]
    sig2 = make_tool_signature(tc2[0])
    is_duplicate = sig2 in seen_tools
    assert is_duplicate, "Should detect duplicate"
    print(f"   ✓ Duplicate tool call detected")

    # Test 4: Verify tool result truncation
    print("\n4. Testing tool result truncation...")
    long_result = "a" * 3000
    max_length = 2000
    if len(long_result) > max_length:
        truncated = long_result[:max_length] + "\n[...truncated...]"
        assert len(truncated) == max_length + len("\n[...truncated...]"), "Should truncate properly"
        print(f"   ✓ Tool result truncated from {len(long_result)} to {len(truncated)} chars")

    print("\n" + "=" * 60)
    print("All end-to-end tests passed!")
    print("=" * 60)


if __name__ == "__main__":
    try:
        asyncio.run(test_tool_flow())
    except AssertionError as e:
        print(f"\n❌ Test failed: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)