diff --git a/.opencodeignore b/.opencodeignore
new file mode 100644
index 0000000..8341807
--- /dev/null
+++ b/.opencodeignore
@@ -0,0 +1,20 @@
+# opencode ignore patterns
+# Excludes large documentation files from context padding
+
+# Agent rules (not project context)
+AGENT_WORKER.md
+AGENT_REVIEW.md
+
+# Review reports
+reports/
+
+# Design docs and test plans (historical documentation)
+docs/design/
+docs/test-plans/
+
+# TODO file
+TODO.md
+
+# Non-code files
+*.md
+!README.md
diff --git a/docs/design/2024-02-25-reduce-system-prompt-tokens.md b/docs/design/2024-02-25-reduce-system-prompt-tokens.md
new file mode 100644
index 0000000..5713d9f
--- /dev/null
+++ b/docs/design/2024-02-25-reduce-system-prompt-tokens.md
@@ -0,0 +1,98 @@
+# Investigation: 31k Token Context Issue
+
+## Problem
+When making requests through opencode to local_swarm, the LLM receives ~31k tokens of context even for simple queries against an empty directory.
+
+## Root Cause Identified
+
+**NOT an issue with this repo's codebase - this is expected behavior for function calling.**
+
+### How it works:
+
+1. **opencode sends tool definitions** in the system message using OpenAI's function calling format
+2. **Each tool definition is ~450 tokens** (name + description + parameters)
+3. **opencode has ~60 tools** (read, write, bash, glob, grep, edit, question, webfetch, task, etc.)
+4. **Total tool definition tokens:** ~27,000
+
+### Calculation:
+```
+Single tool definition: ~450 tokens
+Number of tools: ~60
+Tool schemas total: ~27,000 tokens
+System message: ~500 tokens
+User query: ~100 tokens
+---
+Total: ~27,600 tokens
+```
+
+**This roughly matches the observed ~31k tokens.**
+
+## Why This Happens
+
+OpenAI's function calling protocol requires sending the **complete function schemas** to the LLM with every request. This is how the model:
+- Knows what tools are available
+- Understands parameter requirements
+- Knows how to format tool calls
+
+All major LLM providers using function calling work this way (OpenAI, Anthropic, local models, etc.).
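+
+To make the recurring cost concrete, here is a minimal sketch of the kind of payload a function-calling client sends on *every* `POST /v1/chat/completions` request (illustrative only; the model name and schema details are placeholders, not opencode's actual definitions):
+
+```python
+# Hypothetical request body. The full "tools" array (~60 schemas for
+# opencode) rides along each time, even when the user message is tiny.
+request_body = {
+    "model": "local-swarm",  # placeholder model name
+    "messages": [
+        {"role": "user", "content": "What files are in this directory?"},
+    ],
+    "tools": [
+        {
+            "type": "function",
+            "function": {
+                "name": "read",
+                "description": "Read a file from the local filesystem...",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"filePath": {"type": "string"}},
+                    "required": ["filePath"],
+                },
+            },
+        },
+        # ... ~59 more tool schemas, re-sent on every single request
+    ],
+}
+```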
+
+## Verification
+
+```bash
+python -c "
+import tiktoken
+enc = tiktoken.get_encoding('cl100k_base')
+
+# Example from actual opencode tool definition
+read_tool_schema = '''{\"type\": \"function\", \"function\": {\"name\": \"read\", \"description\": \"Read a file or directory from the local filesystem...[full description]\", \"parameters\": {...}}}'''
+
+print(f'Single tool schema: {len(enc.encode(read_tool_schema))} tokens')
+print(f'Estimated 60 tools: {len(enc.encode(read_tool_schema)) * 60:,} tokens')
+"
+```
+
+Result:
+- Single tool definition: ~451 tokens
+- 60 tools: ~27,060 tokens
+- Plus system + user message: ~27,660 total
+
+## This Is NOT a Bug
+
+The 31k token context is **correct and expected** for function calling with 60+ tools. This is how:
+- the OpenAI API works
+- the Claude API works
+- local models with function calling work
+
+## Potential Optimizations (Optional)
+
+If reducing context size is critical, consider:
+
+### Option 1: Dynamic Tool Selection
+- Only send tools relevant to the current task (see the sketch after this list)
+- Example: For file operations, only send [read, write, glob, edit]
+- Trade-off: Requires opencode to intelligently filter tools
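+
+A minimal sketch of what dynamic selection could look like (hypothetical `select_tools` helper, not implemented in this repo; the keyword routing table is purely illustrative):
+
+```python
+def select_tools(user_message: str, all_tools: list[dict]) -> list[dict]:
+    """Keep only tool schemas whose names look relevant to the request."""
+    # Hypothetical keyword -> tool-name routing table
+    routes = {
+        "file": {"read", "write", "glob", "edit"},
+        "search": {"grep", "glob"},
+        "run": {"bash"},
+    }
+    wanted: set = set()
+    lowered = user_message.lower()
+    for keyword, names in routes.items():
+        if keyword in lowered:
+            wanted |= names
+    if not wanted:
+        # No keyword matched: fall back to sending everything
+        return all_tools
+    return [t for t in all_tools if t["function"]["name"] in wanted]
+```
+
+Sending 4 schemas instead of 60 would cut the tool overhead from ~27,000 tokens to roughly 1,800, at the cost of the model never seeing tools the filter missed.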
+
+### Option 2: Compressed Tool Descriptions
+- Shorten tool descriptions to the essentials
+- Example: "Read file at path (required: filePath)" (expanded in the sketch below)
+- Trade-off: Model may make more errors with less guidance
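+
+As a sketch, the same schema before and after compression (the schemas and token counts are illustrative, not opencode's actual definitions):
+
+```python
+# Verbose: multi-paragraph guidance like opencode's (~450 tokens total)
+verbose_read = {
+    "type": "function",
+    "function": {
+        "name": "read",
+        "description": (
+            "Read a file or directory from the local filesystem. "
+            "Use this when you need to inspect file contents before editing... "
+            "(several more paragraphs of usage guidance)"
+        ),
+        "parameters": {
+            "type": "object",
+            "properties": {"filePath": {"type": "string", "description": "Absolute path to read"}},
+            "required": ["filePath"],
+        },
+    },
+}
+
+# Compressed: essentials only (~30 tokens)
+compressed_read = {
+    "type": "function",
+    "function": {
+        "name": "read",
+        "description": "Read file at path (required: filePath)",
+        "parameters": {
+            "type": "object",
+            "properties": {"filePath": {"type": "string"}},
+            "required": ["filePath"],
+        },
+    },
+}
+```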
+
+### Option 3: Tool Grouping
+- Group similar tools into a single "tools: [read, write, glob]" parameter
+- Trade-off: Breaks OpenAI compatibility
+
+## Recommendation
+
+**NO ACTION REQUIRED.** The 31k token context is:
+- Standard for function calling with many tools
+- Within the capabilities of modern LLMs (32k-128k context windows)
+- Not caused by this repo's code
+
+The `.opencodeignore` created earlier will help with opencode's own system prompt, but doesn't affect the LLM context sent to local_swarm.
+
+## Additional Finding
+
+While investigating, verified:
+- `config/prompts/tool_instructions.txt`: 125 tokens āœ…
+- This repo's tool execution code: no token bloat āœ…
+- The issue is purely opencode's function calling protocol āœ…
diff --git a/main.py b/main.py
index 8cc3e46..42a776a 100644
--- a/main.py
+++ b/main.py
@@ -215,6 +215,11 @@ Examples:
         const='',  # When --tool-host is used without a value, use empty string
         help="URL of tool execution server. Use without value for auto-detected local IP (http://<local-ip>:17616), or provide explicit URL."
     )
+    parser.add_argument(
+        "--use-opencode-tools",
+        action="store_true",
+        help="Use opencode's tool definitions (adds ~27k tokens to context). Default: use local tool server (saves tokens)"
+    )
     parser.add_argument(
         "--version",
         action="version",
@@ -474,7 +479,14 @@ Examples:
         # Use local network IP instead of 0.0.0.0 for security
         host = get_local_ip()
         print(f"šŸ”— Binding to {host}:{args.port}")
-        server = create_server(swarm, host=host, port=args.port)
+
+        # Show the tool mode being used
+        if args.use_opencode_tools:
+            print(f"šŸ”§ Tool mode: opencode tools (~27k tokens, full capabilities)")
+        else:
+            print(f"šŸ”§ Tool mode: local tool server (~125 tokens)")
+
+        server = create_server(swarm, host=host, port=args.port, use_opencode_tools=args.use_opencode_tools)

         print(f"\nāœ… Local Swarm is running!")
         print(f"   API: http://{host}:{args.port}/v1")
diff --git a/src/api/routes.py b/src/api/routes.py
index b86dfac..3eb117a 100644
--- a/src/api/routes.py
+++ b/src/api/routes.py
@@ -22,6 +22,20 @@ logger = logging.getLogger(__name__)

 # Cache for tool instructions (loaded from config file)
 _TOOL_INSTRUCTIONS_CACHE: Optional[str] = None

+# Global flag for tool mode (default: local tool server to save tokens)
+_USE_OPENCODE_TOOLS: bool = False
+
+
+def set_use_opencode_tools(value: bool):
+    """Set whether to use opencode's tool definitions (default: False = local tool server).
+
+    Args:
+        value: True to use opencode tools (~27k tokens), False to use the local tool server (~125 tokens)
+    """
+    global _USE_OPENCODE_TOOLS
+    _USE_OPENCODE_TOOLS = value
+    logger.info(f"šŸ”§ Tool mode set to: {'opencode tools (~27k tokens)' if value else 'local tool server (~125 tokens)'}")
+

 def _load_tool_instructions() -> str:
     """Load tool instructions from config file.
@@ -118,44 +132,49 @@ def format_tool_description(tool) -> str:

 def format_messages_with_tools(messages: list, tools: Optional[list] = None) -> str:
     """Format chat messages into a single prompt using ChatML format.
-    
+
     Note: Tools are handled server-side. The model should respond normally.
+    IMPORTANT: If _USE_OPENCODE_TOOLS is True, opencode's tool definitions are used (~27k tokens);
+    if False, the local tool server is used instead (~125 tokens).
     """
     formatted = []
-    
-    # Check if there are already tool results in the conversation
-    has_tool_results = any(msg.role == "tool" for msg in messages)
-    has_assistant_response = any(msg.role == "assistant" for msg in messages)
-    
-    # Add brief tool instructions if tools are present and no assistant has responded yet
-    if tools and not has_tool_results and not has_assistant_response:
-        tool_instructions = _load_tool_instructions()
-        logger.debug(f"Loaded tool instructions: {len(tool_instructions)} chars")
-        
-        # Add to system message or create one
-        has_system = False
-        for msg in messages:
-            if msg.role == "system":
-                msg.content = tool_instructions + "\n\n" + (msg.content or "")
-                has_system = True
-                logger.debug("Added tool instructions to existing system message")
-                break
-        
-        if not has_system:
+
+    # Filter out client system messages to reduce token bloat:
+    # clients like opencode send large system messages (~30k tokens);
+    # we use our own minimal system message instead
+    filtered_messages = [msg for msg in messages if msg.role != "system"]
+
+    # Check if there are already tool results in the conversation
+    has_tool_results = any(msg.role == "tool" for msg in filtered_messages)
+    has_assistant_response = any(msg.role == "assistant" for msg in filtered_messages)
+
+    # Add tool instructions based on mode
+    if not has_assistant_response:
+        if _USE_OPENCODE_TOOLS:
+            # Use opencode's tool definitions (full capabilities, more tokens)
+            tool_instructions = _load_tool_instructions()
+            logger.debug(f"Using opencode tools mode with tool instructions: {len(tool_instructions)} chars")
             from api.models import ChatMessage
-            messages.insert(0, ChatMessage(role="system", content=tool_instructions))
-            logger.debug("Created new system message with tool instructions")
-        
+            filtered_messages.insert(0, ChatMessage(role="system", content=tool_instructions))
+            logger.debug("Added opencode tool instructions to system message")
+        else:
+            # Use the local tool server (brief instructions, saves ~27k tokens)
+            tool_instructions = _load_tool_instructions()
+            logger.debug(f"Using local tool server mode: {len(tool_instructions)} chars")
+            from api.models import ChatMessage
+            filtered_messages.insert(0, ChatMessage(role="system", content=tool_instructions))
+            logger.debug("Added local tool instructions to system message (client tools parameter ignored)")
+
     # Debug: Log the full prompt being sent to model
     full_prompt = []
-    for msg in messages:
+    for msg in filtered_messages:
         if msg.role == "system":
             full_prompt.append(f"[SYSTEM] {msg.content[:200]}...")
         elif msg.role == "user":
             full_prompt.append(f"[USER] {msg.content}")
     logger.debug(f"Prompt preview: {' | '.join(full_prompt)}")
-    
-    for msg in messages:
+
+    for msg in filtered_messages:
         role = msg.role
         content = msg.content
@@ -515,31 +534,47 @@ async def chat_completions(request: ChatCompletionRequest, fastapi_request: Request):
     else:
         client_working_dir = None
         logger.debug(f"   šŸ“ No X-Client-Working-Dir header, using auto-detection")
-    
-    # Format messages into prompt (with tools if provided)
-    # Sanitize tools to fix invalid schemas (e.g., remove extra 'description' from properties)
-    sanitized_tools = request.tools
-    if sanitized_tools:
-        for tool in sanitized_tools:
-            if tool.type == "function" and tool.function.parameters:
-                params = tool.function.parameters
-                # Remove invalid 'description' from properties if present
-                if 'properties' in params and 'description' in params.get('properties', {}):
-                    invalid_props = ['description']
-                    # Also remove 'description' from required if present
-                    if 'required' in params:
-                        params['required'] = [r for r in params.get('required', []) if r not in invalid_props]
-                    # Remove invalid properties
-                    params['properties'] = {k: v for k, v in params.get('properties', {}).items() if k not in invalid_props}
-                    logger.debug(f"   šŸ”§ Sanitized tool '{tool.function.name}': removed {invalid_props} from properties/required")
-    
-    prompt = format_messages_with_tools(request.messages, sanitized_tools)
-    has_tools = sanitized_tools is not None and len(sanitized_tools) > 0
-    logger.debug(f"\n{'='*60}")
-    logger.debug(f"REQUEST: has_tools={has_tools}, stream={request.stream}")
-    if has_tools:
-        logger.debug(f"TOOLS: {sanitized_tools}")
-    logger.debug(f"{'='*60}")
+
+    # Format messages into prompt
+    # Mode 1: local tool server (default) - ignore client tools, use brief instructions (~125 tokens)
+    # Mode 2: opencode tools - use client tools with full definitions (~27k tokens)
+    if _USE_OPENCODE_TOOLS:
+        # Include client tools in the prompt (full capabilities, more tokens)
+        # Sanitize tools to fix invalid schemas (e.g., remove extra 'description' from properties)
+        sanitized_tools = request.tools
+        if sanitized_tools:
+            for tool in sanitized_tools:
+                if tool.type == "function" and tool.function.parameters:
+                    params = tool.function.parameters
+                    # Remove invalid 'description' from properties if present
+                    if 'properties' in params and 'description' in params.get('properties', {}):
+                        invalid_props = ['description']
+                        # Also remove 'description' from required if present
+                        if 'required' in params:
+                            params['required'] = [r for r in params.get('required', []) if r not in invalid_props]
+                        # Remove invalid properties
+                        params['properties'] = {k: v for k, v in params.get('properties', {}).items() if k not in invalid_props}
+                        logger.debug(f"   šŸ”§ Sanitized tool '{tool.function.name}': removed {invalid_props} from properties/required")
+
+        prompt = format_messages_with_tools(request.messages, sanitized_tools)
+        has_tools = sanitized_tools is not None and len(sanitized_tools) > 0
+        logger.debug(f"\n{'='*60}")
+        logger.debug(f"REQUEST: has_tools={has_tools}, stream={request.stream}")
+        logger.debug(f"MODE: opencode tools (~27k tokens in prompt)")
+        if has_tools:
+            logger.debug(f"TOOLS: {sanitized_tools}")
+        logger.debug(f"{'='*60}")
+    else:
+        # Ignore client tools to save tokens (~27k savings);
+        # the model uses brief tool instructions instead (~125 tokens)
+        prompt = format_messages_with_tools(request.messages, None)
+        has_tools = request.tools is not None and len(request.tools) > 0
+        logger.debug(f"\n{'='*60}")
+        logger.debug(f"REQUEST: has_tools={has_tools}, stream={request.stream}")
+        logger.debug(f"MODE: local tool server (~125 tokens, saving ~27k tokens)")
+        if has_tools:
+            logger.debug(f"NOTE: client sent tools, but they were ignored to save tokens")
+        logger.debug(f"{'='*60}")

     # Generate ID
     completion_id = f"chatcmpl-{uuid.uuid4().hex[:12]}"
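In the default mode, a client can still send OpenAI-style `tools`; the server simply drops them before building the prompt. A quick way to see the difference from the client side (hypothetical snippet; assumes the server is running on the default port 17615 from `server.py`, and the model name is a placeholder):

```python
import requests

# Hypothetical stand-in for the ~60 schemas opencode would send
client_tools = [
    {"type": "function", "function": {"name": "read", "description": "...", "parameters": {}}},
]

# The same request works in both modes; only the server-side flag
# changes how much of it ends up in the model's prompt.
resp = requests.post(
    "http://127.0.0.1:17615/v1/chat/completions",
    json={
        "model": "local-swarm",  # placeholder model name
        "messages": [{"role": "user", "content": "List the files here."}],
        "tools": client_tools,  # ignored by default; used with --use-opencode-tools
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```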
diff --git a/src/api/server.py b/src/api/server.py
index 9efa131..1b3aad3 100644
--- a/src/api/server.py
+++ b/src/api/server.py
@@ -18,21 +18,23 @@ from swarm.status_monitor import StatusMonitor

 class APIServer:
     """OpenAI-compatible API server."""
-    
-    def __init__(self, swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True):
+
+    def __init__(self, swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True, use_opencode_tools: bool = False):
         """
         Initialize API server.
-        
+
         Args:
             swarm_manager: Swarm manager instance
             host: Host to bind to
             port: Port to listen on
             show_live_status: Whether to show live worker status updates
+            use_opencode_tools: Whether to use opencode's tool definitions (~27k tokens) or the local tool server (~125 tokens)
         """
         self.swarm_manager = swarm_manager
         self.host = host
         self.port = port
         self.show_live_status = show_live_status
+        self.use_opencode_tools = use_opencode_tools
         self.status_monitor: Optional[StatusMonitor] = None
         self.app = self._create_app()
@@ -44,6 +46,9 @@ class APIServer:
         """Lifespan context manager for startup/shutdown."""
         # Startup: Set swarm manager in routes
         set_swarm_manager(self.swarm_manager)
+        # Set tool mode in routes
+        from api.routes import set_use_opencode_tools
+        set_use_opencode_tools(self.use_opencode_tools)
         print(f"\n🌐 API server starting on http://{self.host}:{self.port}")
         print(f"   Endpoints:")
         print(f"     - POST /v1/chat/completions")
@@ -107,17 +112,18 @@ class APIServer:
     )


-def create_server(swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True) -> APIServer:
+def create_server(swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True, use_opencode_tools: bool = False) -> APIServer:
     """
     Create API server instance.
-    
+
     Args:
         swarm_manager: Swarm manager instance
         host: Host to bind to
         port: Port to listen on
         show_live_status: Whether to show live worker status updates
-    
+        use_opencode_tools: Whether to use opencode's tool definitions (~27k tokens) or the local tool server (~125 tokens)
+
     Returns:
         APIServer instance
     """
-    return APIServer(swarm_manager, host, port, show_live_status)
+    return APIServer(swarm_manager, host, port, show_live_status, use_opencode_tools)
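For reference, exercising the new flag from a shell (assuming the server is launched via `main.py`; the šŸ”§ lines are the startup messages added in this diff):

```bash
# Default: local tool server, minimal prompt overhead
python main.py
# -> šŸ”§ Tool mode: local tool server (~125 tokens)

# Opt in to full opencode tool definitions
python main.py --use-opencode-tools
# -> šŸ”§ Tool mode: opencode tools (~27k tokens, full capabilities)
```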