dcca89d89a
- Fixed ChatMessage.tool_calls to be Optional with default None (excluded when empty)
- Added logprobs field to ChatCompletionChoice (always included as null)
- Added stats and system_fingerprint to ChatCompletionResponse
- Fixed streaming response to use delta format (not message format)
- Fixed non-streaming response to include logprobs: null
- Updated tool instructions to include 'NO explanations'
- Added pytest-asyncio markers to async tests
- All 41 tests passing

This fixes the "Cannot read properties of undefined (reading 'content')" error in hollama and ensures compatibility with OpenAI clients.
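For reference, a minimal sketch of the shapes these fixes imply, assuming Pydantic v2 style models; field and variable names beyond those listed in the commit message (index, finish_reason, the example payloads) are illustrative, not the project's actual definitions.

from typing import Any, Optional

from pydantic import BaseModel


class ChatMessage(BaseModel):
    role: str
    content: Optional[str] = None
    # Optional with default None so it can be excluded when empty
    tool_calls: Optional[list[dict[str, Any]]] = None


class ChatCompletionChoice(BaseModel):
    index: int  # illustrative field
    message: ChatMessage
    logprobs: Optional[dict[str, Any]] = None  # serialized as null
    finish_reason: Optional[str] = None  # illustrative field


# Non-streaming choices carry a "message"; streaming chunks carry a "delta".
# A client that reads choice["delta"]["content"] from a chunk containing only
# "message" would hit the undefined-property error mentioned above.
non_streaming_choice = {
    "index": 0,
    "message": {"role": "assistant", "content": "hello"},
    "logprobs": None,
    "finish_reason": "stop",
}
streaming_chunk_choice = {
    "index": 0,
    "delta": {"content": "hel"},
    "finish_reason": None,
}

msg = ChatMessage(role="assistant", content="hi")
print(msg.model_dump(exclude_none=True))  # tool_calls omitted because it is None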
101 lines
3.2 KiB
Python
"""End-to-end test for tool execution with a mock server.
|
|
|
|
This tests the complete flow:
|
|
1. Model generates tool call
|
|
2. Tools are executed
|
|
3. Response is generated based on tool results
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import os
|
|
import pytest
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_tool_flow():
    """Test the tool execution flow end-to-end."""

    # Import after path is set
    from api.models import ChatMessage, ChatCompletionRequest
    from api.tool_parser import parse_tool_calls
    from api.formatting import format_messages_with_tools
    from tools.executor import ToolExecutor

    print("=" * 60)
    print("End-to-End Tool Execution Test")
    print("=" * 60)

    # Test 1: Parse tool call from model response
    print("\n1. Testing tool parsing...")
    model_response = "TOOL: bash\nARGUMENTS: {\"command\": \"echo hello\"}"

    content, tool_calls = parse_tool_calls(model_response)
    assert tool_calls is not None, "Should parse tool call"
    assert len(tool_calls) == 1, "Should have one tool call"
    assert tool_calls[0]["function"]["name"] == "bash", "Should be bash tool"
    print(f" ✓ Parsed tool: {tool_calls[0]['function']['name']}")

    # Test 2: Simulate tool result and format for next prompt
    print("\n2. Testing tool result formatting...")
    tool_result = "hello\n"

    # Build conversation history
    messages = [
        ChatMessage(role="user", content="Run echo hello"),
        ChatMessage(role="assistant", content=model_response),
        ChatMessage(role="tool", content=tool_result)
    ]

    # Format for next generation
    next_prompt = format_messages_with_tools(messages, None)
    assert "tool" in next_prompt.lower(), "Prompt should include tool result"
    assert "hello" in next_prompt, "Prompt should include tool output"
    print(" ✓ Tool result formatted for next prompt")

    # Test 3: Verify loop detection
    print("\n3. Testing loop detection...")
    seen_tools = set()

    # First tool call: build a signature from tool name plus (truncated) arguments
    tc1 = [{"function": {"name": "bash", "arguments": '{"command": "ls"}'}}]
    sig1 = f"{tc1[0]['function']['name']}:{tc1[0]['function']['arguments']}"[:50]
    seen_tools.add(sig1)
    print(" ✓ First tool call tracked")

    # Duplicate tool call: the same signature should already be tracked
    tc2 = tc1
    sig2 = f"{tc2[0]['function']['name']}:{tc2[0]['function']['arguments']}"[:50]
    is_duplicate = sig2 in seen_tools
    assert is_duplicate, "Should detect duplicate"
    print(" ✓ Duplicate tool call detected")

    # Test 4: Verify tool result truncation
    print("\n4. Testing tool result truncation...")
    long_result = "a" * 3000
    max_length = 2000

    if len(long_result) > max_length:
        truncated = long_result[:max_length] + "\n[...truncated...]"
        assert len(truncated) == max_length + len("\n[...truncated...]"), "Should truncate properly"
        print(f" ✓ Tool result truncated from {len(long_result)} to {len(truncated)} chars")

    print("\n" + "=" * 60)
    print("All end-to-end tests passed!")
    print("=" * 60)

if __name__ == "__main__":
    try:
        asyncio.run(test_tool_flow())
    except AssertionError as e:
        print(f"\n❌ Test failed: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)