dcca89d89a
- Fixed ChatMessage.tool_calls to be Optional with default None (excluded when empty)
- Added logprobs field to ChatCompletionChoice (always included as null)
- Added stats and system_fingerprint to ChatCompletionResponse
- Fixed streaming response to use delta format (not message format)
- Fixed non-streaming response to include logprobs: null
- Updated tool instructions to include 'NO explanations'
- Added pytest-asyncio markers to async tests
- All 41 tests passing

This fixes the "Cannot read properties of undefined (reading 'content')" error in hollama and ensures compatibility with OpenAI clients.
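For reference, a minimal sketch of the shapes these fixes imply, assuming Pydantic v2 style models; field and variable names beyond those listed in the commit message (index, finish_reason, the example payloads) are illustrative, not the project's actual definitions.

from typing import Any, Optional

from pydantic import BaseModel


class ChatMessage(BaseModel):
    role: str
    content: Optional[str] = None
    # Optional with default None so it can be excluded when empty
    tool_calls: Optional[list[dict[str, Any]]] = None


class ChatCompletionChoice(BaseModel):
    index: int  # illustrative field
    message: ChatMessage
    logprobs: Optional[dict[str, Any]] = None  # serialized as null
    finish_reason: Optional[str] = None  # illustrative field


# Non-streaming choices carry a "message"; streaming chunks carry a "delta".
# A client that reads choice["delta"]["content"] from a chunk containing only
# "message" would hit the undefined-property error mentioned above.
non_streaming_choice = {
    "index": 0,
    "message": {"role": "assistant", "content": "hello"},
    "logprobs": None,
    "finish_reason": "stop",
}
streaming_chunk_choice = {
    "index": 0,
    "delta": {"content": "hel"},
    "finish_reason": None,
}

msg = ChatMessage(role="assistant", content="hi")
print(msg.model_dump(exclude_none=True))  # tool_calls omitted because it is None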
101 lines
3.2 KiB
Python
"""End-to-end test for tool execution with a mock server.
|
|
|
|
This tests the complete flow:
|
|
1. Model generates tool call
|
|
2. Tools are executed
|
|
3. Response is generated based on tool results
|
|
"""
|
|
|
|
import asyncio
|
|
import sys
|
|
import os
|
|
import pytest
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_tool_flow():
    """Test the tool execution flow end-to-end."""

    # Import after path is set
    from api.models import ChatMessage, ChatCompletionRequest
    from api.tool_parser import parse_tool_calls
    from api.formatting import format_messages_with_tools
    from tools.executor import ToolExecutor

    print("=" * 60)
    print("End-to-End Tool Execution Test")
    print("=" * 60)

    # Test 1: Parse tool call from model response
    print("\n1. Testing tool parsing...")
    model_response = "TOOL: bash\nARGUMENTS: {\"command\": \"echo hello\"}"

    content, tool_calls = parse_tool_calls(model_response)
    assert tool_calls is not None, "Should parse tool call"
    assert len(tool_calls) == 1, "Should have one tool call"
    assert tool_calls[0]["function"]["name"] == "bash", "Should be bash tool"
    print(f" ✓ Parsed tool: {tool_calls[0]['function']['name']}")

    # Test 2: Simulate tool result and format for next prompt
    print("\n2. Testing tool result formatting...")
    tool_result = "hello\n"

    # Build conversation history
    messages = [
        ChatMessage(role="user", content="Run echo hello"),
        ChatMessage(role="assistant", content=model_response),
        ChatMessage(role="tool", content=tool_result)
    ]

    # Format for next generation
    next_prompt = format_messages_with_tools(messages, None)
    assert "tool" in next_prompt.lower(), "Prompt should include tool result"
    assert "hello" in next_prompt, "Prompt should include tool output"
    print(" ✓ Tool result formatted for next prompt")

    # Test 3: Verify loop detection
    print("\n3. Testing loop detection...")
    seen_tools = set()

    # First tool call: build a signature from tool name plus (truncated) arguments
    tc1 = [{"function": {"name": "bash", "arguments": '{"command": "ls"}'}}]
    sig1 = f"{tc1[0]['function']['name']}:{tc1[0]['function']['arguments']}"[:50]
    seen_tools.add(sig1)
    print(" ✓ First tool call tracked")

    # Duplicate tool call: the same signature should already be tracked
    tc2 = tc1
    sig2 = f"{tc2[0]['function']['name']}:{tc2[0]['function']['arguments']}"[:50]
    is_duplicate = sig2 in seen_tools
    assert is_duplicate, "Should detect duplicate"
    print(" ✓ Duplicate tool call detected")

    # Test 4: Verify tool result truncation
    print("\n4. Testing tool result truncation...")
    long_result = "a" * 3000
    max_length = 2000

    if len(long_result) > max_length:
        truncated = long_result[:max_length] + "\n[...truncated...]"
        assert len(truncated) == max_length + len("\n[...truncated...]"), "Should truncate properly"
        print(f" ✓ Tool result truncated from {len(long_result)} to {len(truncated)} chars")

    print("\n" + "=" * 60)
    print("All end-to-end tests passed!")
    print("=" * 60)

if __name__ == "__main__":
    try:
        asyncio.run(test_tool_flow())
    except AssertionError as e:
        print(f"\n❌ Test failed: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)