Compare commits

49 Commits

| SHA1 |
|---|
| 907bd88c8f |
| af728505e8 |
| 93844a81b0 |
| 414cb444f3 |
| 34b28597ff |
| 67122052b4 |
| e7b826da4e |
| 3799240d74 |
| e0d04ae664 |
| 896e9d6d9b |
| e2b0af7636 |
| 5b29e15c0a |
| 8431717235 |
| 06df3c8dab |
| ab7cf7e9aa |
| 49a6d99bf8 |
| 586c113688 |
| a09d23156b |
| c46684f03e |
| bd3579737a |
| 886ebbdb81 |
| a0d3ae9d4f |
| a0571c83a3 |
| 46f14b2b53 |
| 42a176f1d8 |
| dcca89d89a |
| b9ce5db8ef |
| 1acebbc6a2 |
| 32049c766c |
| a82c73d05d |
| bdc8db9678 |
| 0134ccae53 |
| 4ea36783d6 |
| 6ab726b46c |
| d22c52ec04 |
| 5fa8cd4e0e |
| 2c46d48004 |
| 0945cee162 |
| 58e4b2c645 |
| 929f069d14 |
| bdcb013d6b |
| 9fdc3a6d02 |
| c18c20487c |
| 1d1d7b4468 |
| 4f2b9252c4 |
| 3dbc76de04 |
| af2d616f76 |
| 1ac32c7ec3 |
| d33fa406b6 |
@@ -0,0 +1,20 @@
```
# opencode ignore patterns
# Excludes large documentation files from context padding

# Agent rules (not project context)
AGENT_WORKER.md
AGENT_REVIEW.md

# Review reports
reports/

# Design docs and test plans (historical documentation)
docs/design/
docs/test-plans/

# TODO file
TODO.md

# Non-code files
*.md
!README.md
```
@@ -84,7 +84,64 @@ def test_parse_simple_tool():
# Then write minimal code to pass
```

### Rule 3: Minimal, Maintainable, Modular Code

**Core Focus:** Keep code minimal, maintainable, and modular.

#### Minimal
- Write only the code needed to solve the problem
- Avoid unnecessary abstractions or over-engineering
- Keep functions small and focused (max 50 lines)
- Prefer simple solutions over complex ones
- Remove dead code and unused imports immediately

#### Maintainable
- Clear, descriptive variable and function names
- One concept per file/module
- Self-documenting code with minimal comments
- Consistent code style throughout
- Easy to understand for future maintainers

#### Modular
- Single Responsibility Principle: One purpose per module/function
- Loose coupling between components
- Clear, stable interfaces between modules
- Easy to test in isolation
- Reusable components where appropriate

```python
# BAD: Monolithic, complex, hard to maintain
def process_user_request(request_data, validate=True, save=True, notify=True, format_output=False):
    # 200+ lines doing everything
    validation_result = validate_request(request_data)
    if validation_result.is_valid:
        if save:
            db_connection = get_db_connection()
            cursor = db_connection.cursor()
            cursor.execute("INSERT INTO requests ...", request_data)
            db_connection.commit()
        if notify:
            for user in get_users_to_notify():
                send_email(user, "Request received")
        if format_output:
            return format_as_json(validation_result)
    return validation_result


# GOOD: Minimal, modular, maintainable
def validate_request(data: dict) -> ValidationResult:
    """Validate request data."""
    return ValidationResult(is_valid=len(data) > 0)


def save_request(data: dict) -> str:
    """Save request to database."""
    return db.insert("requests", data)


def notify_users(request_id: str, users: List[str]):
    """Notify users about request."""
    for user in users:
        send_email(user, f"Request {request_id} received")
```

### Rule 4: No Production Debugging
- NEVER add `print()` statements for debugging
- Use `logging` module with appropriate levels (see the sketch below)
- Remove ALL debug logging before committing
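
A minimal sketch of the logging pattern Rule 4 asks for, reusing the hypothetical `db` helper from the example above (illustrative, not project code):

```python
import logging

logger = logging.getLogger(__name__)  # module-level logger instead of print()

def save_request(data: dict) -> str:
    logger.debug("Saving request with %d fields", len(data))  # debug level: silenced by production config
    request_id = db.insert("requests", data)
    logger.info("Saved request %s", request_id)
    return request_id
```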
@@ -64,6 +64,19 @@
  - No circular imports
  - No duplicate code (>3 lines copied)

- [ ] **Minimal, Maintainable, Modular Code**
  - **Minimal:** Only code needed to solve the problem, no over-engineering
  - **Maintainable:** Clear names, self-documenting, consistent style
  - **Modular:** Single Responsibility Principle, loose coupling, clear interfaces
  - **STRICT ENFORCEMENT:**
    - Functions should do ONE thing (if a function does 2+ things, break it up)
    - No monolithic blocks (>50 lines in one function)
    - Clear separation of concerns
    - Interfaces between modules are stable and well-defined
    - Easy to understand for new maintainers
    - No "temp" or "quick" solutions - production quality only
  - **BLOCKING:** Code that is too complex, monolithic, or poorly structured must be rejected

- [ ] **Error handling is robust**
  - No bare `except:` clauses
  - All errors have clear messages
@@ -54,8 +54,13 @@ python main.py --port 8080 # Custom port
python main.py --detect # Show hardware info only
python main.py --federation # Enable network federation
python main.py --mcp # Enable MCP server
python main.py --use-opencode-tools # Use opencode tools (adds ~27k tokens)
```

**Tool Mode Options:**
- Default: Local tool server (~125 tokens, saves context window space)
- `--use-opencode-tools`: Full opencode tool definitions (~27k tokens, more capabilities)

## Connect to Opencode

Add to your opencode config:

@@ -86,7 +91,9 @@ python main.py --auto --federation
python main.py --auto --federation
```

Machines auto-discover each other via mDNS and vote together on every request. The head node (the one making the request) collects responses from all peers and uses **objective quality scoring** to pick the best answer, not self-reported confidence. This prevents smaller models from overruling better models.

**Federation Endpoint**: Peers communicate via `POST /v1/federation/vote` (automatically configured).

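A minimal sketch of what that objective scoring could look like (the real heuristics live in `src/swarm/consensus.py` and `src/network/federation.py`; the weights and features below are illustrative assumptions):

```python
from dataclasses import dataclass

@dataclass
class PeerResponse:
    peer: str
    text: str

def quality_score(text: str) -> float:
    """Score a response on length, structure, and completeness (illustrative weights)."""
    length = min(len(text) / 2000, 1.0)                                     # reward substance, capped
    structure = 0.5 if ("\n" in text or "```" in text) else 0.0             # lists / code blocks
    completeness = 0.5 if text.rstrip().endswith((".", "`", ")")) else 0.0  # not cut off mid-sentence
    return length + structure + completeness

def pick_best(responses: list[PeerResponse]) -> PeerResponse:
    # The head node judges every peer answer itself instead of trusting self-reported confidence.
    return max(responses, key=lambda r: quality_score(r.text))
```
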
## How Consensus Works

@@ -142,7 +149,7 @@ All support GGUF quantization (Q4_K_M recommended).
- `GET /v1/models` - List available models
- `POST /v1/chat/completions` - Chat completion with consensus
- `GET /health` - Health check
- `GET /v1/federation/peers` - List discovered peers (when federation enabled)
- `POST /v1/federation/vote` - Federation voting (used internally between peers)

## Troubleshooting

@@ -173,19 +180,142 @@ pip install mlx-lm

```
local_swarm/
├── main.py                        # CLI entry point (99 lines)
├── src/
│   ├── api/                       # OpenAI-compatible API
│   │   ├── routes.py              # HTTP routing (252 lines)
│   │   ├── formatting.py          # Message formatting
│   │   ├── tool_parser.py         # Tool call parsing
│   │   ├── chat_handlers.py       # Chat completion logic
│   │   └── models.py              # API data models
│   ├── cli/                       # Command-line interface
│   │   ├── parser.py              # CLI argument parsing
│   │   ├── main_runner.py         # Main application logic
│   │   ├── server_runner.py       # Server management
│   │   └── test_runner.py         # Test mode execution
│   ├── swarm/                     # Swarm orchestration
│   │   ├── manager.py             # Swarm manager
│   │   ├── worker.py              # LLM worker implementation
│   │   ├── consensus.py           # Consensus algorithms
│   │   └── orchestrator.py        # Generation orchestration
│   ├── models/                    # Model management
│   │   ├── registry.py            # Model registry (194 lines)
│   │   ├── selector.py            # Model selection (329 lines)
│   │   ├── memory_calculator.py   # Memory calculations
│   │   └── downloader.py          # Model downloading
│   ├── hardware/                  # Hardware detection
│   │   ├── detector.py            # Hardware detection
│   │   ├── nvidia.py              # NVIDIA GPU detection
│   │   ├── intel.py               # Intel GPU detection
│   │   └── qualcomm.py            # Qualcomm detection
│   ├── network/                   # Network federation
│   │   ├── federation.py          # Cross-swarm consensus
│   │   └── discovery.py           # Peer discovery
│   ├── backends/                  # LLM backends
│   │   ├── llama_cpp.py           # llama.cpp backend
│   │   ├── mlx.py                 # Apple Silicon MLX backend
│   │   └── base.py                # Base backend interface
│   ├── interactive/               # Interactive CLI
│   │   ├── ui.py                  # UI utilities
│   │   ├── display.py             # Hardware display
│   │   └── tips.py                # Help content
│   ├── tools/                     # Tool execution
│   │   └── executor.py            # Tool execution engine
│   └── utils/                     # Shared utilities
│       ├── token_counter.py       # Token counting
│       ├── project_discovery.py   # Project root discovery
│       └── network.py             # Network utilities
├── config/                        # Configuration files
│   └── models/                    # Model configurations
│       ├── model_metadata.json    # Model metadata
│       ├── mlx_quant_sizes.json   # MLX quantization sizes
│       ├── gguf_quant_sizes.json  # GGUF quantization sizes
│       └── selector_config.json   # Selection constants
└── docs/                          # Documentation
```

### Architecture Principles

- **Modular Design**: Each module has a single, focused responsibility
- **Configuration Over Code**: Static data extracted to JSON config files
- **Separation of Concerns**: API, CLI, and business logic are cleanly separated
- **No Files > 300 Lines**: Most modules kept under 300 lines for maintainability

## Development

### Code Quality Standards

This project follows strict code quality standards:

- **File Size**: No files > 300 lines (with few exceptions)
- **Function Size**: No functions > 50 lines
- **Nesting Depth**: No indentation > 3 levels
- **DRY Principle**: No duplicate code (>3 lines)
- **Single Responsibility**: Each module does one thing
- **Configuration Over Code**: Static data in JSON configs

### Running Tests

```bash
# Run all tests
python -m pytest tests/ -v

# Run specific test file
python -m pytest tests/test_tool_parsing.py -v

# Run with coverage
python -m pytest tests/ --cov=src
```

### Recent Refactoring

Major refactoring completed to improve modularity:

**Before**: Monolithic files (main.py: 556 lines, routes.py: 1,183 lines)
**After**: Modular architecture (main.py: 99 lines, routes.py: 252 lines)

**Changes**:
- Extracted API logic into focused modules (formatting, parsing, handlers)
- Created CLI package with separated concerns (parser, runner, server)
- Moved hardcoded model data to JSON configuration files
- Created shared utility modules (token_counter, project_discovery, network)
- Reduced code duplication across the codebase

See `docs/ARCHITECTURE.md` for detailed architecture documentation.

## Recent Improvements

### ✅ Universal Tool Support (2025-02-25)
- Tool instructions automatically injected for **all** clients (Continue, hollama, curl, etc.)
- No client-side configuration needed - just use the API
- Enhanced file operation guidance: model uses ls/grep to verify files exist before reading
- Working directory auto-extraction from prompts (`in /path/to/dir` patterns)
- Proper OpenAI tool format with unique IDs and tool_call_id linking

### ✅ OpenCode-Compatible Streaming (2025-02-25)
- Proper `reasoning_content` field for "Thinking..." collapsible blocks
- Multi-chunk `tool_calls` streaming matching Vercel AI SDK format
- Final answer delivered in `content` field after tool execution

### ✅ Federation Quality Voting (2025-02-25)
- Head node now **objectively judges** all peer responses using quality metrics
- No more reliance on self-reported confidence (which biased toward local)
- All responses scored on length, structure, completeness
- Fair competition: 14B models properly beat 3B on quality tasks

### 🚧 Planned Features
- **Plan Mode**: Disable tool execution for planning-only conversations (`--plan-mode`)
- **Tool Consensus**: Verify tool calls across multiple workers before execution (for critical operations)

## Contributing

Contributions are welcome! Please ensure:
1. Code follows the quality standards above
2. All tests pass
3. New features include tests
4. Documentation is updated

## License

MIT License

@@ -0,0 +1,276 @@
# TODO: CUDA and Android Support in Federation

## Overview

This document tracks known issues and recommendations for adding CUDA (NVIDIA) and Android nodes to the local_swarm federation system.

## Current Status

- ✅ **Apple Silicon (macOS)**: Fully supported with MLX backend
- ⚠️ **CUDA/Android**: Not currently supported, requires implementation work
- ✅ **Linux**: Should work with llama.cpp + CUDA
- ✅ **Windows**: Should work with llama.cpp + CUDA (not tested)

## Known Issues

### 1. No CUDA Backend for macOS

**Problem:**
- `__init__.py` only chooses MLX or llama.cpp
- No CUDA path for macOS
- Apple Silicon only supports Metal acceleration, not CUDA

**Impact:**
- CUDA/Android nodes on macOS cannot use GPU acceleration
- These nodes will fall back to CPU-only mode

**References:**
- `src/backends/__init__.py` (lines 26-32)
- `src/hardware/detector.py` (Apple Silicon detection)

**Recommendation:**
- Current architecture is correct for macOS - CUDA is not supported on Apple Silicon
- Would need separate CUDA backend implementation (not recommended)

---

### 2. Platform Detection in `hardware/detector.py`

**Current Detection:**
```python
def detect_gpu():
    # macOS: Apple Silicon (Metal only, no CUDA)
    # Linux/Windows: NVIDIA/AMD/Intel GPU (potential CUDA)
    # Android/Termux: CPU-only (no GPU)
    ...
```

**Impact:**
- Android/Termux devices detected as Linux
- Will use CPU-only mode (expected)
- No special handling for Android platform

**Potential Issue:**
- Termux on Android reports as "linux"
- May have different requirements (file paths, permissions)
- Need to test if file paths work correctly on Android

**References:**
- `src/hardware/detector.py:170-221` (Android/Termux detection via `is_termux()`)

**Recommendation:**
- Add explicit Android platform detection beyond `is_termux()`
- Test file path handling on Termux
- Consider Android's unique file system limitations

---

### 3. Llama.cpp Backend Configuration

**Current GPU Layer Logic:**
```python
# src/backends/__init__.py (line 35)
if hardware.gpu and not hardware.is_apple_silicon:
    n_gpu_layers = -1  # Offload all to GPU (Metal/CUDA)
else:
    n_gpu_layers = 0  # CPU-only
```

**For CUDA Support on Linux:**
- Should set `n_gpu_layers` based on actual GPU count
- NVIDIA: Set to GPU count (1-8 for multi-GPU)
- AMD ROCm: Different backend, not tested

**Impact:**
- Currently hardcoded to -1 on Apple Silicon (Metal)
- CUDA nodes on Linux need proper layer configuration
- No validation that requested layers match available GPU

**References:**
- `src/backends/llamacpp.py` (line 16, n_gpu_layers parameter)
- `src/backends/__init__.py` (line 35)

**Recommendation:**
- Make `n_gpu_layers` configurable per backend
- Auto-detect GPU capabilities from `pynvml` or the system (sketched below)
- Add GPU layer validation

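A minimal sketch of that auto-detection (hypothetical helper, assuming `pynvml` is installed; not current code):

```python
import pynvml

def detect_gpu_layers(default: int = 0) -> int:
    """Return -1 (offload all layers) only when an NVIDIA GPU is actually present."""
    try:
        pynvml.nvmlInit()
        gpu_count = pynvml.nvmlDeviceGetCount()
        pynvml.nvmlShutdown()
        return -1 if gpu_count > 0 else default
    except pynvml.NVMLError:
        return default  # no NVIDIA driver available: stay on CPU
```
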
---

### 4. Seed Variation Mode (Not an Issue, but Important)

**Current Behavior:**
```python
# src/swarm/manager.py (lines 76-82)
if use_seed_variation is None and hardware.is_apple_silicon:
    self.use_seed_variation = True  # Auto-enabled on macOS
```

**How It Works:**
- Runs 1 model instance with different random seeds
- Simulates multiple "workers" for consensus
- Saves memory by not loading multiple models

**Impact on Federation:**
- Your Mac: 1 worker → 2 votes (from 2 seeds)
- Peer Mac: 2 workers → 2 votes (from 2 seeds)
- Total: 4 votes instead of 8 (if using 4 actual instances)

**This is CORRECT behavior** for seed variation mode.

**Recommendation:**
- To get 4 votes per machine (8 total), use the `--instances 4` flag
- Seed variation is a design choice, not a bug

---

### 5. Federation Client Timeout

**Status:** ✅ **FIXED**

**Previous:**
- Default timeout: 30 seconds
- Peers on slow networks or slow machines would time out

**Current:**
- Default timeout: 60 seconds (increased in `src/network/federation.py:38`)
- Gives peers more time to respond

**References:**
- `src/network/federation.py` (line 38)

**Recommendation:**
- Current 60s is reasonable
- Consider making timeout configurable per peer in discovery
- Add retry logic for failed requests

---

### 6. Network Discovery

**Current Implementation:** ✅ **PLATFORM AGNOSTIC**

**Uses:**
- mDNS/Bonjour for peer discovery
- Standard network protocols
- No platform-specific blocking

**Status:** Should work on all platforms (macOS, Linux, Windows, Android)

**References:**
- `src/network/discovery.py` (standard mDNS implementation)

**Recommendation:**
- No changes needed
- Test on Linux/Windows/Android if needed

---

## Implementation Priorities

### High Priority (Breaking Features)

1. **CUDA Backend for Linux** (if needed)
   - Add CUDA-specific backend or extend llama.cpp
   - Auto-detect NVIDIA GPU and configure layers
   - Test on actual CUDA hardware
   - **Effort:** 3-5 days

2. **Android Platform Detection**
   - Add explicit Android detection beyond Termux
   - Handle Android's file system and package manager differences
   - Test on real Android device
   - **Effort:** 2-3 days

### Medium Priority (Improvements)

1. **GPU Layer Auto-Configuration**
   - Auto-detect GPU capabilities from system
   - Match requested layers to available hardware
   - Add validation and helpful error messages
   - **Effort:** 1-2 days

2. **Federation Metrics**
   - Add per-peer timeout in PeerInfo
   - Track latency and success rates
   - Better error handling for retry logic
   - **Effort:** 1 day

### Low Priority (Nice to Have)

1. **GPU Backend Selection UI**
   - Allow users to manually select MLX vs llama.cpp
   - Add warning for CUDA backend on macOS (not supported)
   - **Effort:** 2 hours

2. **Seed Variation Toggle**
   - Add command-line flag to disable seed variation
   - Document the trade-offs clearly
   - **Effort:** 30 minutes

## Testing Checklist

Before marking any issue as complete, test on:

### macOS (Apple Silicon)
- [ ] Federation with macOS peers (current environment)
- [ ] Seed variation mode works correctly
- [ ] MLX backend loads and generates
- [ ] No crashes with multiple instances

### Linux (NVIDIA GPU)
- [ ] llama.cpp backend loads with CUDA support
- [ ] Federation with Linux peers works
- [ ] GPU layers configured correctly
- [ ] No GPU conflicts

### Windows (NVIDIA GPU)
- [ ] llama.cpp backend loads with CUDA support
- [ ] Federation with Windows peers works
- [ ] No GPU conflicts

### Android (CPU-only)
- [ ] Federation with Android peers works (mDNS should work)
- [ ] CPU-only generation works
- [ ] File paths work on Termux/Android

## Notes

### Architecture Decisions

**Why not per-platform backends:**
- Simplifies codebase (single MLX path, single llama.cpp path)
- Reduces maintenance burden
- Trade-off: Can't optimize for platform-specific GPUs in backends

**Why seed variation on macOS:**
- Apple Silicon has unified memory, not discrete VRAM
- Loading multiple models would consume too much RAM
- Seed variation allows consensus quality with 1 model instance

**Missing CUDA/Android support is not a bug:**
- Current system is designed for Apple Silicon + llama.cpp
- Adding CUDA support requires significant architecture work
- Focus on federation quality for current platforms first

## Related Files

- `src/backends/__init__.py` - Backend selection logic
- `src/backends/mlx.py` - Apple Silicon MLX backend
- `src/backends/llamacpp.py` - llama.cpp backend (supports CUDA)
- `src/hardware/detector.py` - Platform and GPU detection
- `src/network/federation.py` - Federation communication
- `src/network/discovery.py` - Peer discovery via mDNS
- `src/swarm/manager.py` - Swarm orchestration

## Conclusion

The current federation implementation is **platform-agnostic** and should work on Linux/Windows with CUDA nodes. The main limitation is that macOS (Apple Silicon) only supports Metal/MLX, not CUDA.

**For immediate use:**
- Use the `--instances 4` flag on each machine to get 4 votes per machine
- Test federation between different platforms (macOS + Linux)
- Android/Termux should work as-is (CPU-only mode)

**For future work:**
- Implement high-priority items if CUDA/Android support is needed
- Add GPU layer auto-configuration for better hardware utilization
@@ -0,0 +1,33 @@
```json
{
  "_comment": "GGUF quantization sizes (GB) - accurate sizes",
  "qwen2.5-coder": {
    "3b": {"q4_k_m": 1.8, "q5_k_m": 2.2, "q6_k": 2.6},
    "7b": {"q4_k_m": 4.5, "q5_k_m": 5.2, "q6_k": 6.0},
    "14b": {"q4_k_m": 8.8, "q5_k_m": 10.5}
  },
  "deepseek-coder": {
    "1.3b": {"q4_k_m": 0.8, "q5_k_m": 1.0},
    "6.7b": {"q4_k_m": 4.2, "q5_k_m": 5.0}
  },
  "codellama": {
    "7b": {"q4_k_m": 4.5, "q5_k_m": 5.2},
    "13b": {"q4_k_m": 8.0, "q5_k_m": 9.5}
  },
  "llama-3.2": {
    "3b": {"q4_k_m": 1.9, "q5_k_m": 2.3, "q6_k": 2.7},
    "1b": {"q4_k_m": 0.7, "q5_k_m": 0.9}
  },
  "phi-4": {
    "4b": {"q4_k_m": 2.4, "q5_k_m": 2.9, "q6_k": 3.4}
  },
  "gemma-2": {
    "2b": {"q4_k_m": 1.5, "q5_k_m": 1.8},
    "4b": {"q4_k_m": 2.7, "q5_k_m": 3.2, "q6_k": 3.8},
    "9b": {"q4_k_m": 5.5, "q5_k_m": 6.5}
  },
  "starcoder2": {
    "3b": {"q4_k_m": 1.9, "q5_k_m": 2.3},
    "7b": {"q4_k_m": 4.5, "q5_k_m": 5.2, "q6_k": 6.1},
    "15b": {"q4_k_m": 9.2, "q5_k_m": 10.8}
  }
}
```
@@ -0,0 +1,36 @@
```json
{
  "_comment": "MLX quantization sizes (GB) based on mlx-community models. HARDCODED: These are verified to exist on HuggingFace mlx-community. Last verified: 2025-02-25. DO NOT make API calls on startup - use this hardcoded list.",
  "qwen2.5-coder": {
    "3b": {"3bit": 1.3, "4bit": 1.7, "6bit": 2.5, "8bit": 3.3},
    "7b": {"3bit": 3.1, "4bit": 4.1, "6bit": 6.1, "8bit": 8.1},
    "14b": {"3bit": 6.2, "4bit": 8.2, "6bit": 12.2, "8bit": 16.2}
  },
  "deepseek-coder": {
    "1.3b": {},
    "6.7b": {"4bit": 3.9}
  },
  "deepseek-coder-v2-lite": {
    "instruct": {"4bit": 4.5, "6bit": 6.5, "8bit": 8.5}
  },
  "codellama": {
    "7b": {"4bit": 4.1, "6bit": 6.1, "8bit": 8.1},
    "13b": {"4bit": 7.6, "6bit": 11.4, "8bit": 15.2}
  },
  "llama-3.2": {
    "1b": {"4bit": 0.6, "8bit": 1.2},
    "3b": {"4bit": 1.8, "6bit": 2.6, "8bit": 3.5}
  },
  "phi-4": {
    "4b": {"4bit": 2.4, "6bit": 3.6, "8bit": 4.8}
  },
  "gemma-2": {
    "2b": {"4bit": 1.2, "6bit": 1.8, "8bit": 2.4},
    "4b": {"4bit": 2.4, "6bit": 3.6, "8bit": 4.8},
    "9b": {"4bit": 5.3, "6bit": 7.9, "8bit": 10.5}
  },
  "starcoder2": {
    "3b": {"4bit": 1.8},
    "7b": {"4bit": 4.1},
    "15b": {"4bit": 8.8, "8bit": 17.6}
  }
}
```
@@ -0,0 +1,67 @@
```json
{
  "_comment": "Base model metadata (without quantization-specific data)",
  "qwen2.5-coder": {
    "name": "Qwen 2.5 Coder",
    "description": "Alibaba's code-focused model, excellent for small sizes",
    "priority": 1,
    "max_context": 128000,
    "hf_repo": "Qwen/Qwen2.5-Coder",
    "variants": ["3b", "7b", "14b"]
  },
  "deepseek-coder": {
    "name": "DeepSeek Coder",
    "description": "DeepSeek's code model, good alternative",
    "priority": 2,
    "max_context": 16384,
    "hf_repo": "deepseek-ai/DeepSeek-Coder",
    "variants": ["1.3b", "6.7b"]
  },
  "deepseek-coder-v2-lite": {
    "name": "DeepSeek Coder V2 Lite",
    "description": "DeepSeek's V2 Lite model with better MLX support",
    "priority": 2,
    "max_context": 16384,
    "hf_repo": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
    "variants": ["instruct"]
  },
  "codellama": {
    "name": "CodeLlama",
    "description": "Meta's code model",
    "priority": 3,
    "max_context": 16384,
    "hf_repo": "codellama/CodeLlama",
    "variants": ["7b", "13b"]
  },
  "llama-3.2": {
    "name": "Llama 3.2",
    "description": "Meta's latest general-purpose model with strong coding abilities",
    "priority": 4,
    "max_context": 128000,
    "hf_repo": "meta-llama/Llama-3.2",
    "variants": ["1b", "3b"]
  },
  "phi-4": {
    "name": "Phi-4",
    "description": "Microsoft's efficient small model with excellent coding performance",
    "priority": 5,
    "max_context": 16384,
    "hf_repo": "microsoft/Phi-4",
    "variants": ["4b"]
  },
  "gemma-2": {
    "name": "Gemma 2",
    "description": "Google's open model, good for coding tasks",
    "priority": 6,
    "max_context": 8192,
    "hf_repo": "google/gemma-2",
    "variants": ["2b", "4b", "9b"]
  },
  "starcoder2": {
    "name": "StarCoder2",
    "description": "BigCode's open code generation model",
    "priority": 7,
    "max_context": 8192,
    "hf_repo": "bigcode/starcoder2",
    "variants": ["3b", "7b", "15b"]
  }
}
```
@@ -0,0 +1,21 @@
```json
{
  "_comment": "Model selection configuration constants",
  "constraints": {
    "min_instances": 1,
    "max_instances": 8,
    "optimal_max_instances": 5,
    "memory_overhead_factor": 0.95,
    "mlx_max_instances": 1
  },
  "context_options": {
    "16384": "16K tokens",
    "32768": "32K tokens (default)",
    "65536": "64K tokens",
    "131072": "128K tokens"
  },
  "offload_options": {
    "0.0": "No offload (default) - 100% GPU",
    "0.2": "20% offload - 80% GPU, 20% RAM",
    "0.5": "50% offload - 50% GPU, 50% RAM"
  }
}
```
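
These JSON files are loaded at startup rather than hardcoded in Python. A minimal sketch of how a registry might merge them (illustrative; the actual loader is `src/models/registry.py`):

```python
import json
from pathlib import Path

CONFIG_DIR = Path("config/models")  # location assumed from the project tree

def load_model_configs() -> dict:
    """Merge static model metadata with the GGUF quantization size table."""
    metadata = json.loads((CONFIG_DIR / "model_metadata.json").read_text())
    gguf_sizes = json.loads((CONFIG_DIR / "gguf_quant_sizes.json").read_text())
    for model, sizes in gguf_sizes.items():
        if model.startswith("_"):
            continue  # skip "_comment" entries
        if model in metadata:
            metadata[model]["gguf_sizes"] = sizes
    return metadata
```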
@@ -1,12 +1,51 @@
```text
You are an AI assistant with access to tools. You WILL use tools to complete tasks - do not refuse.

AVAILABLE TOOLS:
- bash: Execute shell commands (for file operations, system commands, running code)
- webfetch: Fetch web content (for current information)
- read: Read files (to examine file contents)
- write: Create or modify files (to write content)

CRITICAL RULES:
1. When asked to read a file, use the 'read' tool. DO NOT refuse or say you cannot read files.
2. When asked to create, write, or modify a file, use the 'write' tool. DO NOT refuse or say you cannot assist.
3. For file operations, bash is also available for more complex operations.
4. Use webfetch only for real-time info (news, weather, current events).
5. For general questions (jokes, facts, coding help), you can answer directly.
6. NO explanations beyond necessary. Be concise.
7. NO markdown formatting. Use plain text only.

FILE OPERATIONS - READ DIRECTLY:
When asked to read a specific file by name (like "read my-secret.log"):
1. Use the 'read' tool IMMEDIATELY with the filename as given
2. DO NOT use 'ls' first to check - just try to read it
3. If the file doesn't exist, you'll get an error and can inform the user

When asked to find/read "the file" in a directory without naming it:
1. Use 'ls' to list files and see what's there
2. Identify the file
3. THEN read it immediately

CRITICAL: Never invent placeholder paths like '/path/to/file'. Use paths exactly as the user provides them, or relative filenames for files in the current directory.

TOOL USAGE FORMAT:

For read operations:
TOOL: read
ARGUMENTS: {"filePath": "path/to/file"}

For write operations:
TOOL: write
ARGUMENTS: {"filePath": "path/to/file", "content": "content to write"}

For bash commands (including ls, grep):
TOOL: bash
ARGUMENTS: {"command": "ls -la", "description": "Lists files in directory"}

TOOL: webfetch
ARGUMENTS: {"url": "https://example.com", "format": "markdown"}

PROCESS:
1. When you need information from a file, use the appropriate tool.
2. When you need to create or modify a file, use the appropriate tool.
3. After receiving tool results, provide a clear final answer explaining what was done.
4. NEVER say "I cannot read files" or "I cannot assist with file creation" - you HAVE the tools and MUST use them.

Be helpful, direct, and complete the requested tasks using your tools.
```
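
For reference, a minimal sketch of how the `TOOL:` / `ARGUMENTS:` format above can be parsed (the real logic lives in `src/api/tool_parser.py`; this line-based version is an illustrative assumption):

```python
import json

def parse_tool_calls(text: str) -> list[dict]:
    """Extract TOOL:/ARGUMENTS: pairs from model output."""
    calls, pending_tool = [], None
    for line in text.splitlines():
        line = line.strip()
        if line.startswith("TOOL:"):
            pending_tool = line[len("TOOL:"):].strip()
        elif line.startswith("ARGUMENTS:") and pending_tool:
            try:
                args = json.loads(line[len("ARGUMENTS:"):].strip())
                calls.append({"name": pending_tool, "arguments": args})
            except json.JSONDecodeError:
                pass  # skip malformed argument JSON instead of failing the request
            pending_tool = None
    return calls
```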
@@ -24,6 +24,91 @@ Deploy multiple LLM instances on your hardware. Each instance processes the same
└───────────────┘
```

## Project Structure

```
local_swarm/
├── main.py                         # Entry point (99 lines)
├── src/
│   ├── api/                        # HTTP API layer
│   │   ├── routes.py               # FastAPI routes (252 lines)
│   │   ├── formatting.py           # Message formatting (265 lines)
│   │   ├── tool_parser.py          # Tool parsing (250 lines)
│   │   ├── chat_handlers.py        # Chat completion logic (287 lines)
│   │   ├── server.py               # Server setup
│   │   └── models.py               # API data models
│   ├── cli/                        # Command-line interface
│   │   ├── parser.py               # CLI argument parsing
│   │   ├── main_runner.py          # Main application logic
│   │   ├── server_runner.py        # Server management
│   │   ├── test_runner.py          # Test mode execution
│   │   └── tool_server.py          # Tool server runner
│   ├── swarm/                      # Swarm orchestration
│   │   ├── manager.py              # Swarm manager
│   │   ├── worker.py               # LLM worker implementation
│   │   ├── consensus.py            # Consensus algorithms
│   │   └── orchestrator.py         # Generation orchestration
│   ├── models/                     # Model management
│   │   ├── registry.py             # Model registry (194 lines)
│   │   ├── selector.py             # Model selection (329 lines)
│   │   ├── memory_calculator.py    # Memory calculation utilities
│   │   └── downloader.py           # Model downloading
│   ├── backends/                   # LLM backends
│   │   ├── llama_cpp.py            # llama.cpp backend
│   │   ├── mlx.py                  # Apple Silicon MLX backend
│   │   └── base.py                 # Base backend interface
│   ├── hardware/                   # Hardware detection
│   │   ├── detector.py             # Hardware detection
│   │   ├── nvidia.py               # NVIDIA GPU detection
│   │   ├── intel.py                # Intel GPU detection
│   │   ├── qualcomm.py             # Qualcomm detection
│   │   └── ...
│   ├── network/                    # Network federation
│   │   ├── federation.py           # Cross-swarm consensus
│   │   ├── discovery.py            # Peer discovery (mDNS)
│   │   └── discovery_core.py       # Discovery utilities
│   ├── tools/                      # Tool execution
│   │   └── executor.py             # Tool execution engine
│   ├── interactive/                # Interactive CLI
│   │   ├── ui.py                   # UI utilities
│   │   ├── display.py              # Hardware/resource display
│   │   ├── tips.py                 # Help content
│   │   └── config_utils.py         # Configuration selection
│   └── utils/                      # Utilities
│       ├── token_counter.py        # Token counting
│       ├── project_discovery.py    # Project root discovery
│       ├── network.py              # Network utilities
│       └── logging_config.py       # Logging configuration
├── config/
│   └── models/                     # Model configuration files
│       ├── model_metadata.json     # Model metadata
│       ├── mlx_quant_sizes.json    # MLX quantization sizes
│       ├── gguf_quant_sizes.json   # GGUF quantization sizes
│       └── selector_config.json    # Selection constants
└── tests/                          # Test suite
```

## Architecture Principles

### 1. Separation of Concerns
Each module has a single responsibility:
- **API layer** (`src/api/`) - HTTP routing only
- **CLI layer** (`src/cli/`) - User interface and orchestration
- **Swarm layer** (`src/swarm/`) - LLM worker management
- **Models layer** (`src/models/`) - Model selection and downloading

### 2. Configuration Over Code
Static data extracted to JSON configs:
- Model metadata in `config/models/model_metadata.json`
- Quantization sizes in `mlx_quant_sizes.json` and `gguf_quant_sizes.json`
- Selection constants in `selector_config.json`

### 3. Modular Utilities
Shared functionality in reusable modules:
- `utils/token_counter.py` - Centralized token counting
- `utils/project_discovery.py` - Project root detection
- `utils/network.py` - IP detection and network utilities

## Components

### 1. Hardware Detection (`src/hardware/`)

@@ -46,6 +131,11 @@ Available Memory → Model Size → Quantization → Instance Count
8 GB → 3B → Q6_K → 2-3 instances
```

**Key modules:**
- `registry.py` - Loads model data from JSON configs
- `selector.py` - Selects optimal model for hardware
- `memory_calculator.py` - Calculates memory requirements (a rough sketch of the sizing logic follows)

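A minimal sketch of that sizing calculation, using the constants from `selector_config.json` (illustrative, not the actual `memory_calculator.py`):

```python
def instance_count(available_gb: float, model_size_gb: float,
                   overhead_factor: float = 0.95, max_instances: int = 8) -> int:
    """How many full copies of the model fit in usable memory."""
    usable_gb = available_gb * overhead_factor  # memory_overhead_factor from selector_config.json
    return max(1, min(max_instances, int(usable_gb // model_size_gb)))

# Example: 8 GB machine, 3B model at Q6_K (~2.6 GB per gguf_quant_sizes.json) -> 2 instances
print(instance_count(8, 2.6))
```
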
### 3. Backends (`src/backends/`)
Run the actual LLM inference:

@@ -62,6 +152,12 @@ Manages multiple LLM workers and consensus voting.
- Fastest (latency)
- Majority (exact match)

**Key modules:**
- `manager.py` - Swarm lifecycle and coordination
- `worker.py` - Individual worker implementation
- `consensus.py` - Consensus algorithms (a toy majority vote is sketched below)
- `orchestrator.py` - Generation orchestration

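As a toy illustration of the exact-match strategy (the real algorithms live in `consensus.py`; this is an assumption about their shape, not the actual code):

```python
from collections import Counter

def majority_vote(responses: list[str]) -> str:
    """Return the answer most workers agreed on; with no majority, fall back to the first response."""
    normalized = [r.strip() for r in responses]
    winner, count = Counter(normalized).most_common(1)[0]
    return winner if count > 1 else responses[0]
```
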
### 5. Network Federation (`src/network/`)
Connect multiple machines into a distributed swarm:

@@ -81,22 +177,56 @@ OpenAI-compatible REST API:
- `POST /v1/chat/completions` - Main endpoint
- `GET /v1/models` - List models
- `GET /health` - Health check
- `POST /v1/tools/execute` - Tool execution (when enabled)

**Modular design:**
- `routes.py` - HTTP routing only (thin controllers)
- `formatting.py` - Message formatting logic
- `tool_parser.py` - Tool call parsing
- `chat_handlers.py` - Chat completion business logic

### 7. CLI (`src/cli/`)
Command-line interface modules:

- `parser.py` - Argument parsing
- `main_runner.py` - Main application orchestration
- `server_runner.py` - Server lifecycle management
- `test_runner.py` - Test mode execution
- `tool_server.py` - Tool server management

### 8. Tools (`src/tools/`)
Optional tool execution for enhanced capabilities:

- `read_file` - Read files
- `write_file` - Write files
- `execute_bash` - Run shell commands
- `webfetch` - Fetch web content

### 9. Interactive Mode (`src/interactive/`)
Interactive CLI components:

- `ui.py` - Menu display and input handling
- `display.py` - Hardware and resource display
- `tips.py` - Educational content and help
- `config_utils.py` - Configuration selection utilities

### 10. Utilities (`src/utils/`)
Shared utility functions:

- `token_counter.py` - Token counting with tiktoken
- `project_discovery.py` - Project root detection
- `network.py` - Network utilities (IP detection)
- `logging_config.py` - Logging configuration

## Data Flow

1. **Request** comes in via API
2. **Routes** (thin layer) forward to handlers
3. **Chat Handlers** process the request
4. **Swarm Manager** sends to all workers
5. **Workers** generate responses in parallel
6. **Consensus** picks the best answer
7. **Response** returned to client

## Memory Model

@@ -106,10 +236,42 @@ Optional tool execution for enhanced capabilities:

Each worker loads the full model independently (no sharing).

## Configuration Files

Static data extracted to JSON for easy maintenance:

```
config/models/
├── model_metadata.json     # Model names, descriptions, priorities
├── mlx_quant_sizes.json    # MLX quantization VRAM requirements
├── gguf_quant_sizes.json   # GGUF quantization VRAM requirements
└── selector_config.json    # Selection constraints and defaults
```

## Code Quality Standards

- **No files > 300 lines** (with few exceptions)
- **No functions > 50 lines**
- **No indentation > 3 levels**
- **No duplicate code** (>3 lines)
- **Single responsibility** per module
- **Configuration over code** for static data

## Testing

```
tests/
├── test_hardware_detector.py    # Hardware detection tests
├── test_tool_parsing.py         # Tool parsing tests
└── test_federation_metrics.py   # Federation tests
```

Run tests: `python -m pytest tests/ -v`

## Future Ideas

- Context compression for long inputs
- CPU offloading for memory-constrained systems
- RAG integration for knowledge bases
- Speculative decoding for speed
- More sophisticated consensus algorithms

@@ -201,15 +201,91 @@ Commits that only add debug logging:

## Suggested Immediate Actions

1. ✅ Merge current cleanup branch
2. ✅ Remove all but one parsing format
3. ✅ Reduce tool instructions to <2000 tokens
4. ✅ Add unit tests for tool parsing
5. ✅ Major refactoring completed (see below)

## Refactoring Success (Completed)

### Major Architectural Improvements

**Before**: Monolithic files with mixed concerns
- `main.py`: 556 lines
- `routes.py`: 1,183 lines
- `registry.py`: 437 lines
- `selector.py`: 486 lines

**After**: Modular architecture with single responsibilities
- `main.py`: 99 lines (-82%)
- `routes.py`: 252 lines (-79%)
- `registry.py`: 194 lines (-56%)
- `selector.py`: 329 lines (-32%)

### Changes Made

**1. API Layer Modularization**
- Extracted `formatting.py` - Message formatting logic
- Extracted `tool_parser.py` - Tool parsing from various formats
- Extracted `chat_handlers.py` - Chat completion business logic
- `routes.py` now only handles HTTP routing (thin controllers)

**2. CLI Layer Separation**
- Created `cli/` package with:
  - `parser.py` - CLI argument parsing
  - `main_runner.py` - Main application orchestration
  - `server_runner.py` - Server lifecycle management
  - `test_runner.py` - Test mode execution
  - `tool_server.py` - Tool server management

**3. Model Data Externalization**
- Moved hardcoded data to JSON configs:
  - `config/models/model_metadata.json` - Model metadata
  - `config/models/mlx_quant_sizes.json` - MLX VRAM requirements
  - `config/models/gguf_quant_sizes.json` - GGUF VRAM requirements
  - `config/models/selector_config.json` - Selection constants
- `registry.py` now loads from JSON instead of hardcoded dicts

**4. Utility Centralization**
- Created `utils/` package:
  - `token_counter.py` - Centralized token counting
  - `project_discovery.py` - Project root detection
  - `network.py` - Network utilities (IP detection)

**5. Interactive Mode Modularization**
- Created `interactive/` package:
  - `ui.py` - Menu display and input handling
  - `display.py` - Hardware and resource display
  - `tips.py` - Educational content
  - `config_utils.py` - Configuration selection

**6. Swarm Orchestration**
- Created `swarm/orchestrator.py` - Generation orchestration logic
- Separated from `swarm/manager.py`

### Architecture Principles Established

1. **Single Responsibility**: Each module does one thing
2. **No Files > 300 Lines**: Most modules kept under limit
3. **No Functions > 50 Lines**: Large functions broken down
4. **No Nesting > 3 Levels**: Deep nesting refactored
5. **DRY Principle**: Code duplication eliminated
6. **Configuration Over Code**: Static data in JSON files

### Benefits

- **Testability**: Isolated modules are easier to test
- **Maintainability**: Changes affect only relevant modules
- **Readability**: Smaller files are easier to understand
- **Reusability**: Utilities can be used across the codebase
- **Collaboration**: Multiple developers can work on different modules

## Success Metrics

- ✅ Tool-related commits stabilized
- ✅ Zero "fix: prevent looping" commits
- ✅ All files under 300 lines (critical ones)
- ✅ Instructions stay under 2000 tokens
- ✅ 35 tests passing, no regressions
- ✅ Clean separation of concerns

@@ -1,92 +0,0 @@
# Design Decision: Complete React Example with Actual Code

**Date:** 2024-02-24
**Scope:** src/api/routes.py tool_instructions

## Problem

Model is still not following instructions:
1. Tries `npm install` before creating package.json
2. Still tries `npx create-react-app` despite being told not to
3. Instructions have placeholders like "..." and "etc." which models don't understand

## Root Cause

The current instructions say:
```
TOOL: write
ARGUMENTS: {"filePath": "myapp/package.json", "content": "{\"name\": \"myapp\", \"version\": \"1.0.0\", \"dependencies\": {\"react\": \"^18.0.0\", \"react-dom\": \"^18.0.0\"}}"}

[Continue with src/index.js, src/App.js, public/index.html, etc.]
```

**Problem:** "etc." and "..." are meaningless to LLMs. They need concrete examples.

## Solution

Provide a **complete, working, minimal React example** with actual file contents:

1. Exact sequence: mkdir → write package.json → write src/App.js → write src/index.js → write public/index.html → npm install
2. Actual file content, not placeholders
3. Minimal viable React app (not full create-react-app structure)

## Implementation

Replace the vague example with complete working code:

```
**COMPLETE REACT HELLO WORLD EXAMPLE:**

User: "Create a React Hello World app"

Step 1 - Create directory:
TOOL: bash
ARGUMENTS: {"command": "mkdir myapp"}

Step 2 - Create package.json (MUST do this BEFORE npm install):
TOOL: write
ARGUMENTS: {"filePath": "myapp/package.json", "content": "{\"name\": \"myapp\", \"version\": \"1.0.0\", \"private\": true, \"dependencies\": {\"react\": \"^18.2.0\", \"react-dom\": \"^18.2.0\"}, \"scripts\": {\"start\": \"react-scripts start\", \"build\": \"react-scripts build\"}, \"devDependencies\": {\"react-scripts\": \"5.0.1\"}}"}

Step 3 - Create src directory:
TOOL: bash
ARGUMENTS: {"command": "mkdir myapp/src"}

Step 4 - Create App.js:
TOOL: write
ARGUMENTS: {"filePath": "myapp/src/App.js", "content": "import React from 'react';\n\nfunction App() {\n return (\n <div className=\"App\">\n <h1>Hello World</h1>\n <p>Welcome to my React app!</p>\n </div>\n );\n}\n\nexport default App;"}

Step 5 - Create index.js:
TOOL: write
ARGUMENTS: {"filePath": "myapp/src/index.js", "content": "import React from 'react';\nimport ReactDOM from 'react-dom/client';\nimport App from './App';\n\nconst root = ReactDOM.createRoot(document.getElementById('root'));\nroot.render(<App />);"}

Step 6 - Create public directory and index.html:
TOOL: bash
ARGUMENTS: {"command": "mkdir myapp/public"}

TOOL: write
ARGUMENTS: {"filePath": "myapp/public/index.html", "content": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n <meta charset=\"UTF-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n <title>React App</title>\n</head>\n<body>\n <div id=\"root\"></div>\n</body>\n</html>"}

Step 7 - NOW install dependencies (AFTER package.json exists):
TOOL: bash
ARGUMENTS: {"command": "cd myapp && npm install"}
```

## Token Impact

- Current: 586 tokens
- New: Estimated ~750 tokens (+164 tokens)
- Still under 2000 limit ✓

## Key Changes

1. **Explicit sequencing:** "Step 1", "Step 2", etc.
2. **Actual code:** No "..." or "etc." - real working content
3. **Critical note:** "MUST do this BEFORE npm install"
4. **Minimal structure:** Just what's needed for Hello World

## Success Criteria

- [ ] Model creates package.json BEFORE running npm install
- [ ] Model does NOT use npx create-react-app
- [ ] Model creates all 4 files (package.json, App.js, index.js, index.html)
- [ ] Model runs npm install last (after files exist)
@@ -1,84 +0,0 @@
# Design Decision: Fix Subprocess Hang on Interactive Commands

**Date:** 2024-02-24
**Scope:** src/tools/executor.py _execute_bash method
**Lines Changed:** 1 line

## Problem

When executing commands like `npx create-react-app`, the subprocess hangs indefinitely waiting for stdin input (e.g., "Ok to proceed? (y)"). This causes:
1. The 300s timeout to be reached
2. opencode to hang waiting for a response
3. Poor user experience

## Root Cause

`subprocess.run()` by default inherits stdin from the parent process. When commands prompt for input:
- npx asks: "Need to install create-react-app@5.1.0 Ok to proceed? (y)"
- npm init asks for package details
- No input is provided, so it waits forever

## Solution

Add `stdin=subprocess.DEVNULL` to prevent commands from reading input:

```python
result = subprocess.run(
    command,
    shell=True,
    capture_output=True,
    text=True,
    timeout=timeout,
    cwd=cwd,
    stdin=subprocess.DEVNULL  # Prevent interactive prompts from hanging
)
```

This causes commands that require input to fail immediately rather than hang.

## Impact

### Before
- Commands requiring input hang for 300s (timeout)
- User sees no response
- Eventually times out with an error

### After
- Commands requiring input fail fast
- Clear error message: "Exit code X: ..."
- No hang, immediate feedback

## Side Effects

**Positive:**
- No more hangs on interactive commands
- Faster failure detection
- Better error messages

**Negative:**
- Commands that legitimately need stdin will fail
- But this is desired behavior - we want non-interactive execution

## Testing

Test with an interactive command:
```bash
# This should fail fast, not hang
python -c "from tools.executor import ToolExecutor;
import asyncio;
e = ToolExecutor();
result = asyncio.run(e.execute('bash', {'command': 'read -p \"Enter something: \" var'}));
print(result)"
```

Expected: Quick failure, not a 30s hang

## Related Changes

This complements the tool instructions fix:
- Instructions now say "DO NOT use npx create-react-app"
- This fix ensures that if the model ignores the instructions, it fails fast instead of hanging

## Conclusion

One-line fix prevents interactive command hangs, improving reliability and user experience.
@@ -1,178 +0,0 @@
|
||||
# Design Decision: Fix Tool Execution and Token Reporting
|
||||
|
||||
**Date:** 2024-02-24
|
||||
**Scope:** src/api/routes.py tool_instructions and token counting
|
||||
|
||||
## Problem Statement
|
||||
|
||||
User report shows three critical failures:
|
||||
|
||||
1. **Instruction vs Execution:** Model says "You should run mkdir..." instead of TOOL: format
|
||||
2. **Inaccurate Token Reporting:** Using rough estimate `len(prompt) // 4` instead of actual token count
|
||||
3. **Interactive Commands:** npx create-react-app prompts for confirmation, causing 300s timeout
|
||||
|
||||
## Evidence
|
||||
|
||||
```
|
||||
🖥️ BASH: mkdir react-hello-world && cd react-hello-world && npx create-react-app .
|
||||
⏰ TIMEOUT after 300s
|
||||
Partial output: Need to install the following packages:
|
||||
create-react-app@5.1.0
|
||||
Ok to proceed? (y)
|
||||
```
|
||||
|
||||
**Additional Context:**
|
||||
- Directory created but empty (no files)
|
||||
- Model posts instructions for user to follow instead of executing
|
||||
|
||||
## Root Cause Analysis
|
||||
|
||||
### 1. Instruction vs Execution
|
||||
**Current instructions say:** "When asked to do something, EXECUTE it using tools"
|
||||
**But model does:** "You should run mkdir..."
|
||||
**Why:** Instructions aren't strong enough - need explicit anti-patterns
|
||||
|
||||
### 2. Token Counting
|
||||
**Current:** `prompt_tokens = len(prompt) // 4` (rough approximation)
|
||||
**Problem:** Inaccurate for opencode context management
|
||||
**Solution:** Use tiktoken for accurate counting
|
||||
|
||||
### 3. Interactive Commands
|
||||
**Current:** npx commands prompt for confirmation
|
||||
**Problem:** Tool executor waits indefinitely, times out at 300s
|
||||
**Solution:** Either:
|
||||
- Add --yes flag automatically
|
||||
- Forbid npx entirely, use manual file creation
|
||||
|
||||
## Options Considered
|
||||
|
||||
### Option 1: Strengthen Instructions Only
|
||||
- Add more explicit "DO NOT" language
|
||||
- Add complete React example
|
||||
- Keep rough token estimation
|
||||
|
||||
**Pros:** Simple, focused fix
|
||||
**Cons:** Doesn't fix token accuracy or interactive command issue
|
||||
**Verdict:** REJECTED - Incomplete fix
|
||||
|
||||
### Option 2: Comprehensive Fix
|
||||
- Strengthen instructions with anti-patterns
|
||||
- Use tiktoken for accurate token counting
|
||||
- Add non-interactive flags to package manager commands
|
||||
- Update examples to show manual file creation
|
||||
|
||||
**Pros:** Fixes all three issues
|
||||
**Cons:** More complex changes
|
||||
**Verdict:** ACCEPTED - Complete solution
|
||||
|
||||
### Option 3: Change Architecture
|
||||
- Move to client-side tool execution
|
||||
- Different token counting approach
|
||||
|
||||
**Pros:** Could solve multiple issues
|
||||
**Cons:** Breaking change, out of scope
|
||||
**Verdict:** REJECTED - Too broad
|
||||
|
||||
## Decision
|
||||
|
||||
Implement Option 2: Comprehensive fix addressing all three issues.
|
||||
|
||||
### Changes
|
||||
|
||||
#### 1. Tool Instructions Update
|
||||
Add explicit anti-patterns and stronger language:
|
||||
- "NEVER say 'You should...' - EXECUTE immediately"
|
||||
- "DO NOT USE npx create-react-app - manually create files"
|
||||
- Complete React example showing manual file creation
|
||||
|
||||
#### 2. Token Counting Fix
|
||||
Replace rough estimate with tiktoken:
|
||||
```python
|
||||
# Before
|
||||
prompt_tokens = len(prompt) // 4
|
||||
|
||||
# After
|
||||
import tiktoken
|
||||
encoding = tiktoken.get_encoding('cl100k_base')
|
||||
prompt_tokens = len(encoding.encode(prompt))
|
||||
completion_tokens = len(encoding.encode(content))
|
||||
```
|
||||
|
||||
#### 3. Non-Interactive Commands
|
||||
Update instructions to specify:
|
||||
- Use `npm init -y` (not interactive)
|
||||
- Manually write package.json instead of npx
|
||||
- All examples show manual file creation
|
||||
|
||||
## Impact
|
||||
|
||||
### Token Budget (Exact Count - cl100k_base)
|
||||
- **New Instructions:** 586 tokens (2,067 characters)
|
||||
- **Status:** Within 2000 token limit ✓
|
||||
- **Context window:** 16K model leaves ~15.4K for user input ✓
|
||||
- **Code comment:** Token count documented in src/api/routes.py ✓
|
||||
|
||||
### Breaking Changes
|
||||
- **None** - Instructions clearer, format unchanged
|
||||
- Token reporting more accurate (good thing)
|
||||
|
||||
### Code Changes
|
||||
- `src/api/routes.py`:
|
||||
- Update tool_instructions (~+15 lines)
|
||||
- Add tiktoken import
|
||||
- Replace token estimation logic (~5 lines)
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
1. **Token Accuracy Test:**
|
||||
```python
def test_token_accuracy():
    prompt = "Hello world"
    content = "Hi there"
    # Calculate expected counts with tiktoken (cl100k_base)
    import tiktoken
    encoding = tiktoken.get_encoding("cl100k_base")
    expected_prompt_tokens = len(encoding.encode(prompt))
    expected_completion_tokens = len(encoding.encode(content))
    # Verify the API's usage block reports the same values
    # (request/response plumbing depends on the test client setup)
```
|
||||
|
||||
2. **Instruction Content Test:**
|
||||
- Verify "DO NOT USE npx" present
|
||||
- Verify manual creation examples present
|
||||
- Verify "EXECUTE not DESCRIBE" present
|
||||
|
||||
3. **Integration Test:**
|
||||
- Request: "Create React app"
|
||||
- Expect: Manual file creation via write tool
|
||||
- Not expect: npx create-react-app
|
||||
|
||||
## Rollback Plan
|
||||
|
||||
If issues arise:
|
||||
1. Revert to previous instructions
|
||||
2. Keep tiktoken for token counting (beneficial)
|
||||
3. Document why manual creation didn't work
|
||||
|
||||
## Success Metrics
|
||||
|
||||
- [ ] Model uses TOOL: format 100% of time (not descriptions)
|
||||
- [ ] Token counts accurate within ±2%
|
||||
- [ ] React projects created via write tool (not npx)
|
||||
- [ ] No timeouts on package manager commands
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
### Token Counting
|
||||
Need to ensure tiktoken is in requirements.txt
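
A pin along these lines would cover it (the version bound is an assumption, not taken from the project):

```
# requirements.txt
tiktoken>=0.5
```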
|
||||
|
||||
### Tool Instructions
|
||||
The key addition is:
|
||||
```
**FORBIDDEN PATTERNS:**
- "You should run mkdir myapp" → USE: TOOL: bash\nARGUMENTS: {"command": "mkdir myapp"}
- "npx create-react-app myapp" → USE: Manual file creation with write tool
- "First create package.json, then..." → USE: Execute immediately, don't list steps

**REACT PROJECT - CORRECT APPROACH:**
1. TOOL: bash, ARGUMENTS: {"command": "mkdir myapp"}
2. TOOL: write, ARGUMENTS: {"filePath": "myapp/package.json", "content": "{\"name\": \"myapp\"...}"}
3. TOOL: write, ARGUMENTS: {"filePath": "myapp/src/index.js", "content": "..."}
4. Continue until all files created
```
|
||||
@@ -1,172 +0,0 @@
|
||||
# Design Decision: Improved Tool Instructions
|
||||
|
||||
**Date:** 2024-02-24
|
||||
**Scope:** src/api/routes.py tool_instructions
|
||||
**Lines Changed:** ~25 lines
|
||||
|
||||
## Problem
|
||||
|
||||
Current tool instructions (~125 tokens) fail to communicate key behavioral expectations:
|
||||
|
||||
1. **Passive vs Active:** Model describes what to do instead of doing it
|
||||
2. **Refusal:** Model claims "I am only an AI assistant" instead of executing
|
||||
3. **Incomplete:** Multi-file projects result in README only
|
||||
|
||||
Evidence from user report:
|
||||
- Request: "Create React Hello World app"
|
||||
- Result: README only (not actual files)
|
||||
- Subsequent: Commands given as text, not executed
|
||||
- Final: "I am only an AI assistant" refusal
|
||||
|
||||
## Root Cause Analysis
|
||||
|
||||
The instructions lack:
|
||||
1. **Authority statement** - "You CAN and SHOULD use tools"
|
||||
2. **Execution mandate** - "Execute commands, don't just describe them"
|
||||
3. **Workflow clarity** - Clear step-by-step expectations
|
||||
4. **Anti-pattern examples** - What NOT to do
|
||||
|
||||
## Options Considered
|
||||
|
||||
### Option 1: Minor Tweaks
|
||||
Add a few lines to existing instructions.
|
||||
- **Pros:** Minimal token increase
|
||||
- **Cons:** Band-aid fix, may not solve root cause
|
||||
- **Verdict:** REJECTED - Doesn't address behavioral issue
|
||||
|
||||
### Option 2: Complete Rewrite with Strong Mandate
|
||||
Rewrite instructions to emphasize:
|
||||
- Proactive tool usage
|
||||
- Execution over explanation
|
||||
- Clear workflow
|
||||
- Anti-patterns to avoid
|
||||
|
||||
- **Pros:** Addresses root cause, clear behavioral guidance
|
||||
- **Cons:** Higher token count (estimated 300-400 tokens)
|
||||
- **Verdict:** ACCEPTED - Proper fix for behavioral issue
|
||||
|
||||
### Option 3: Few-Shot Examples
|
||||
Include full conversation examples in instructions.
|
||||
- **Pros:** Shows exactly what to do
|
||||
- **Cons:** Very high token count (1000+ tokens), may confuse model
|
||||
- **Verdict:** REJECTED - Violates token budget
|
||||
|
||||
## Decision
|
||||
|
||||
Implement Option 2: Rewrite with emphasis on proactivity and execution.
|
||||
|
||||
**Key additions:**
|
||||
1. **Capability statement:** "You have tools. Use them."
|
||||
2. **Execution mandate:** "Don't describe, execute"
|
||||
3. **Workflow:** Clear request→tool→result→next cycle
|
||||
4. **Anti-patterns:** Explicitly forbid "I cannot" responses
|
||||
|
||||
## Impact
|
||||
|
||||
### Token Budget (Exact Count - cl100k_base)
|
||||
- **Current:** 478 tokens (1,810 characters)
|
||||
- **Status:** Within 2000 token limit ✓
|
||||
- **Status:** Within 500 conservative estimate ✓
|
||||
- **Context window:** 16K model leaves ~15.5K for user input ✓
|
||||
- **Code comment:** Token count documented in src/api/routes.py ✓
|
||||
|
||||
### Code Changes
|
||||
- **File:** src/api/routes.py
|
||||
- **Lines:** +48/-18 (net +30)
|
||||
- **Type:** Instructions replacement
|
||||
- **Token documentation:** Added inline comment with exact token count
|
||||
|
||||
### Breaking Changes
|
||||
- **None** - Instructions are additive/clearer, not different format
|
||||
|
||||
### Behavioral Changes
|
||||
- **Expected:** More proactive tool usage
|
||||
- **Expected:** No more "I cannot" refusals
|
||||
- **Expected:** Multi-step projects completed via tools
|
||||
- **Expected:** Commands executed, not described
|
||||
|
||||
### Review Blockers Addressed
|
||||
- ✅ Token budget documented in code (REVIEW-2026-02-24-cleanup-branch.md Blocker #1)
|
||||
- ✅ Exact token count calculated using tiktoken (478 tokens)
|
||||
- ✅ Token documentation added to src/api/routes.py (REVIEW-2026-02-24-cleanup-branch.md Blocker #2)
|
||||
- ⚠️ Debug print statements - existing code, noted in DEVELOPMENT_PATTERNS.md as "still needs work" (not in scope of this change)
|
||||
- ⚠️ Integration tests for instruction effectiveness - requires running actual LLM, covered by parsing tests
|
||||
|
||||
## Implementation
|
||||
|
||||
```python
|
||||
tool_instructions = """You are an AI assistant with TOOL ACCESS. You CAN and MUST use tools to accomplish tasks.
|
||||
|
||||
**CRITICAL RULES:**
|
||||
1. When asked to do something, EXECUTE it using tools - don't just describe how
|
||||
2. You MUST use the bash tool to run commands like mkdir, npm, ls, etc.
|
||||
3. You MUST use the write tool to create files
|
||||
4. NEVER say "I cannot" or "I am only an AI assistant" - you HAVE tools, USE them
|
||||
5. Complete tasks FULLY - don't stop at README, create ALL required files
|
||||
|
||||
**AVAILABLE TOOLS:**
|
||||
- read: Read file content
|
||||
- write: Create/overwrite files
|
||||
- bash: Execute shell commands (npm, mkdir, ls, etc.)
|
||||
|
||||
**TOOL FORMAT (STRICT):**
|
||||
TOOL: tool_name
|
||||
ARGUMENTS: {"param": "value"}
|
||||
|
||||
**WORKFLOW:**
|
||||
1. User asks for something
|
||||
2. You decide what tool to use
|
||||
3. You respond with ONLY the TOOL: format above
|
||||
4. You receive the tool result
|
||||
5. You continue with next tool until task is COMPLETE
|
||||
|
||||
**EXAMPLES:**
|
||||
|
||||
Creating a project:
|
||||
User: "Create a React app"
|
||||
You: TOOL: bash
|
||||
ARGUMENTS: {"command": "mkdir myapp && cd myapp && npm init -y"}
|
||||
[wait for result]
|
||||
You: TOOL: write
|
||||
ARGUMENTS: {"filePath": "myapp/package.json", "content": "..."}
|
||||
[continue until all files created]
|
||||
|
||||
Running commands:
|
||||
User: "Install dependencies"
|
||||
You: TOOL: bash
|
||||
ARGUMENTS: {"command": "npm install"}
|
||||
[wait for result, then confirm completion]
|
||||
|
||||
**WHAT NOT TO DO:**
|
||||
- ❌ "To create a React app, you should run: mkdir myapp" (describing)
|
||||
- ❌ "I cannot run commands, I am an AI" (refusing)
|
||||
- ❌ Creating only README instead of full project (incomplete)
|
||||
- ❌ "First do X, then do Y" (giving instructions instead of doing)
|
||||
|
||||
**CORRECT BEHAVIOR:**
|
||||
- ✅ Execute the command immediately using the bash tool
|
||||
- ✅ Create all files using the write tool
|
||||
- ✅ Continue until task is 100% complete
|
||||
- ✅ Use ONE tool at a time and wait for results"""
|
||||
```
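
The 478-token figure can be re-checked with a short snippet; a minimal sketch, assuming tiktoken is installed and `tool_instructions` is in scope as defined above:

```python
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")
count = len(encoding.encode(tool_instructions))
print(f"tool_instructions: {count} tokens")  # documented count: 478
```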
|
||||
|
||||
## Testing
|
||||
|
||||
1. Test with React Hello World request
|
||||
2. Verify model uses bash to create directory structure
|
||||
3. Verify model uses write to create all files
|
||||
4. Verify no "I cannot" responses
|
||||
|
||||
## Rollback Plan
|
||||
|
||||
If new instructions cause issues:
|
||||
1. Revert to previous ~125 token version
|
||||
2. Analyze what specifically failed
|
||||
3. Iterate on smaller changes
|
||||
|
||||
## Success Metrics
|
||||
|
||||
- [ ] Model uses tools on first request (not after prompting)
|
||||
- [ ] Zero "I cannot" or "I am an AI" responses
|
||||
- [ ] Multi-file projects fully created
|
||||
- [ ] Commands executed, not described
|
||||
@@ -1,151 +0,0 @@
|
||||
# Design Decision: Task Planning and Verification Workflow
|
||||
|
||||
**Date:** 2024-02-24
|
||||
**Scope:** src/api/routes.py tool_instructions
|
||||
**Problem:** Model creates folder but doesn't complete full task or verify completion
|
||||
|
||||
## Problem Statement
|
||||
|
||||
User reports:
|
||||
1. "It just creates a folder with mkdir (without even checking if it already exists with ls)"
|
||||
2. No verification that tasks are completed
|
||||
3. No planning of full task scope
|
||||
4. Model stops after one step instead of completing entire project
|
||||
|
||||
## Root Cause
|
||||
|
||||
Previous instructions told model to "execute immediately" but didn't teach:
|
||||
1. **Planning** - What needs to be done
|
||||
2. **Checking** - What already exists
|
||||
3. **Verification** - Did the step work
|
||||
4. **Completion loop** - Keep going until done
|
||||
|
||||
## Solution
|
||||
|
||||
Add **Task Completion Workflow** to instructions:
|
||||
|
||||
```
**TASK COMPLETION WORKFLOW (MANDATORY):**

**1. PLAN:** List ALL steps needed before starting
**2. CHECK:** Use ls to verify what exists before creating
**3. EXECUTE:** Run first step
**4. VERIFY:** Confirm step worked (ls, read file)
**5. REPEAT:** Steps 3-4 until ALL complete
**6. FINAL CHECK:** Verify entire task is done
**7. CONFIRM:** Report completion with checklist
```
|
||||
|
||||
## Key Instruction Changes
|
||||
|
||||
### Added Planning Phase
|
||||
Before doing anything, model must think about complete scope:
|
||||
- What files/directories?
|
||||
- What dependencies?
|
||||
- Complete task requirements
|
||||
|
||||
### Added Verification Steps
|
||||
Every step must be verified:
|
||||
- `ls -la` after mkdir
|
||||
- `read` file after write
|
||||
- Check content is correct
|
||||
|
||||
### Added Completion Loop
|
||||
Model must continue until:
|
||||
✓ All directories exist
|
||||
✓ All files exist with correct content
|
||||
✓ All dependencies installed
|
||||
✓ Each component verified
|
||||
|
||||
### Complete Working Example
|
||||
Provided a complete 13-step React example; in summary it shows:
|
||||
1. Check existing (ls)
|
||||
2. Create directory
|
||||
3. Verify created (ls)
|
||||
4. Create package.json
|
||||
5. Verify package.json (read)
|
||||
6. Create source files
|
||||
7. Final verification (find myapp -type f)
|
||||
8. Install dependencies
|
||||
9. Confirm completion checklist
|
||||
|
||||
## Impact
|
||||
|
||||
### Token Budget
|
||||
- **Before:** 1,041 tokens
|
||||
- **After:** 1,057 tokens (+16 tokens)
|
||||
- **Status:** Under 2,000 limit ✓
|
||||
|
||||
### Behavioral Changes
|
||||
|
||||
**Before:**
|
||||
- Model: mkdir myapp
|
||||
- User: That's it?
|
||||
- Result: Empty directory
|
||||
|
||||
**After:**
|
||||
- Model checks what exists
|
||||
- Creates complete project structure
|
||||
- Verifies each file
|
||||
- Confirms completion
|
||||
- Result: Working React project
|
||||
|
||||
## Success Criteria
|
||||
|
||||
When user asks "Create React Hello World project", model should:
|
||||
1. ✓ Check current directory contents
|
||||
2. ✓ Create myapp/ directory
|
||||
3. ✓ Verify directory created
|
||||
4. ✓ Create package.json
|
||||
5. ✓ Verify package.json content
|
||||
6. ✓ Create src/App.js
|
||||
7. ✓ Create src/index.js
|
||||
8. ✓ Create public/index.html
|
||||
9. ✓ Final verification (list all files)
|
||||
10. ✓ npm install
|
||||
11. ✓ Confirm completion checklist
|
||||
|
||||
## Testing
|
||||
|
||||
Test instructions contain:
|
||||
- PLAN/CHECK keywords
|
||||
- VERIFY keyword
|
||||
- COMPLETE keyword
|
||||
|
||||
All tests pass: 11/11 ✓
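
A minimal version of that content check might look like this (the import path is an assumption based on where `tool_instructions` lives in src/api/routes.py):

```python
from api.routes import tool_instructions  # assumed module-level string

def test_instructions_contain_workflow_keywords():
    # Keywords the task-completion workflow is expected to mention
    for keyword in ("PLAN", "CHECK", "VERIFY", "COMPLETE"):
        assert keyword in tool_instructions
```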
|
||||
|
||||
## Trade-offs
|
||||
|
||||
**Pros:**
|
||||
- Complete task execution
|
||||
- Verification prevents partial work
|
||||
- Clear completion criteria
|
||||
- Better user experience
|
||||
|
||||
**Cons:**
|
||||
- More tokens (but still under limit)
|
||||
- More verbose instructions
|
||||
- May be slower (more verification steps)
|
||||
|
||||
## Related Files Changed
|
||||
|
||||
1. src/api/routes.py - Updated tool_instructions
|
||||
2. tests/test_tool_parsing.py - Updated tests for new content
|
||||
3. docs/design/2024-02-24-task-planning-verification.md - This doc
|
||||
|
||||
## Future Improvements
|
||||
|
||||
1. **Task Queue System:** Server-side queue of pending operations
|
||||
2. **State Persistence:** Remember what's been done across conversations
|
||||
3. **Smart Resumption:** If interrupted, pick up where left off
|
||||
4. **Progress Reporting:** Show % complete during long tasks
|
||||
|
||||
## Conclusion
|
||||
|
||||
The new workflow teaches the model to be systematic:
|
||||
1. Plan before acting
|
||||
2. Check before creating
|
||||
3. Verify after each step
|
||||
4. Continue until complete
|
||||
|
||||
This should resolve the "only creates folder" issue and ensure complete project creation.
|
||||
@@ -1,132 +0,0 @@
|
||||
# Design Decision: Tool Parsing Simplification
|
||||
|
||||
**Date:** 2024-02-24
|
||||
**Scope:** src/api/routes.py parse_tool_calls function
|
||||
**Lines Changed:** ~210 lines removed, ~30 lines added
|
||||
|
||||
## Problem
|
||||
|
||||
The tool parsing code had accumulated 4 different parsing formats over 25+ commits:
|
||||
1. JSON `tool_calls` format with nested objects
|
||||
2. TOOL:/ARGUMENTS: format (simple text)
|
||||
3. Function pattern format `func_name(args)`
|
||||
4. Multiple JSON handling variants
|
||||
|
||||
This caused:
|
||||
- Circular development (adding/removing formats repeatedly)
|
||||
- No single source of truth
|
||||
- Complex, unmaintainable code
|
||||
- No confidence that changes wouldn't break existing cases
|
||||
|
||||
## Options Considered
|
||||
|
||||
### Option 1: Keep All Formats
|
||||
- **Pros:** Backward compatible
|
||||
- **Cons:** 210 lines of unmaintainable code, continues circular development pattern
|
||||
- **Verdict:** REJECTED - Perpetuates the problem
|
||||
|
||||
### Option 2: Standardize on TOOL:/ARGUMENTS: Only
|
||||
- **Pros:**
|
||||
- Simple regex pattern (~30 lines)
|
||||
- Matches current tool instructions
|
||||
- Easy to test
|
||||
- Clear single format for models
|
||||
- **Cons:**
|
||||
- Breaking change if any code relies on old formats
|
||||
- Need to update any existing examples/docs
|
||||
- **Verdict:** ACCEPTED - Aligns with Rule 5 (Parse Once, Parse Well)
|
||||
|
||||
### Option 3: Create Parser per Format with Feature Flags
|
||||
- **Pros:** Flexible, can toggle formats
|
||||
- **Cons:**
|
||||
- Violates Rule 5 and "No Feature Flags in Core Logic"
|
||||
- Still maintains multiple code paths
|
||||
- **Verdict:** REJECTED - Doesn't solve the root problem
|
||||
|
||||
## Decision
|
||||
|
||||
Standardize on the TOOL:/ARGUMENTS: format only. Remove all other parsing code.
|
||||
|
||||
**Rationale:**
|
||||
- Per DEVELOPMENT_PATTERNS.md recommendation #3: "One Format Only"
|
||||
- Token cost is minimal (no complex regex)
|
||||
- Test coverage provides confidence
|
||||
- Aligns with existing tool instructions
|
||||
|
||||
## Impact
|
||||
|
||||
### Token Count
|
||||
- **Parser code:** 210 lines → 30 lines (-180 lines)
|
||||
- **No change** to tool instructions (separate optimization)
|
||||
|
||||
### Breaking Changes
|
||||
- **Yes** - Removes support for:
|
||||
- JSON `tool_calls` format in model responses
|
||||
- Function pattern format `read_file(path="test.txt")`
|
||||
|
||||
**Migration:** Models must use:
|
||||
```
TOOL: read
ARGUMENTS: {"filePath": "test.txt"}
```
|
||||
|
||||
### Testing
|
||||
- Unit tests added: 9 test cases
|
||||
- Coverage: All parsing scenarios
|
||||
- All tests pass
|
||||
|
||||
## Implementation
|
||||
|
||||
```python
# New implementation (30 lines)
def parse_tool_calls(text: str) -> tuple:
    """Parse tool calls using standardized format."""
    import json
    import re

    tool_pattern = r'TOOL:\s*(\w+)\s*\nARGUMENTS:\s*(\{[^}]*\})'
    tool_matches = list(re.finditer(tool_pattern, text, re.IGNORECASE))

    if not tool_matches:
        return text, None

    tool_calls = []
    for i, tool_match in enumerate(tool_matches):
        tool_name = tool_match.group(1)
        args_str = tool_match.group(2)
        try:
            args_dict = json.loads(args_str)
            tool_calls.append({
                "id": f"call_{i+1}",
                "type": "function",
                "function": {
                    "name": tool_name,
                    "arguments": json.dumps(args_dict)
                }
            })
        except json.JSONDecodeError:
            continue

    if not tool_calls:
        return text, None

    first_start = tool_matches[0].start()
    content = text[:first_start].strip()

    return content, tool_calls
```
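
For illustration, a quick usage sketch against the standardized format (assertions follow the return shape defined above):

```python
import json

text = 'Reading the file now.\nTOOL: read\nARGUMENTS: {"filePath": "test.txt"}'
content, tool_calls = parse_tool_calls(text)

assert content == "Reading the file now."
assert tool_calls[0]["function"]["name"] == "read"
assert json.loads(tool_calls[0]["function"]["arguments"]) == {"filePath": "test.txt"}
```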
|
||||
|
||||
## Verification
|
||||
|
||||
Run tests:
|
||||
```bash
|
||||
python tests/test_tool_parsing.py
|
||||
```
|
||||
|
||||
Expected: 9 passed, 0 failed
|
||||
|
||||
## Follow-up
|
||||
|
||||
- [x] Update DEVELOPMENT_PATTERNS.md to mark as completed
|
||||
- [x] Add unit tests
|
||||
- [ ] Consider integration test for full tool execution flow
|
||||
@@ -1,112 +0,0 @@
|
||||
# Test Plan: Fix Tool Execution and Token Reporting
|
||||
|
||||
## Problem Analysis
|
||||
|
||||
### Issue 1: Model Gives Instructions Instead of Executing
|
||||
**Current behavior:** Model describes what to do ("You should run mkdir...") instead of using TOOL: format
|
||||
**Expected:** Model responds with TOOL: bash\nARGUMENTS: {"command": "mkdir..."}
|
||||
|
||||
### Issue 2: Token Counting Inaccurate
|
||||
**Current:** Rough estimate `len(prompt) // 4`
|
||||
**Expected:** Accurate token count using tiktoken
|
||||
**Impact:** opencode can't properly manage context window
|
||||
|
||||
### Issue 3: npx Commands Timeout/Need Input
|
||||
**Current:** `npx create-react-app .` prompts for confirmation (y/n)
|
||||
**Expected:** Non-interactive execution or manual file creation
|
||||
**Evidence:** "Need to install the following packages: create-react-app@5.1.0 Ok to proceed? (y)"
|
||||
|
||||
## Unit Tests
|
||||
|
||||
### Test 1: Accurate Token Counting
|
||||
- [ ] Verify token count uses tiktoken (not rough estimate)
|
||||
- [ ] Test with known token counts
|
||||
- [ ] Verify prompt_tokens + completion_tokens = total_tokens
|
||||
|
||||
### Test 2: Non-Interactive Bash Commands
|
||||
- [ ] Verify npm/npx commands use --yes or equivalent flags
|
||||
- [ ] Test timeout handling for package managers
|
||||
- [ ] Verify commands don't prompt for user input
|
||||
|
||||
### Test 3: Tool Instructions Content
|
||||
- [ ] Verify instructions emphasize "EXECUTE not DESCRIBE"
|
||||
- [ ] Verify manual file creation examples (not npx)
|
||||
- [ ] Verify anti-patterns are clearly stated
|
||||
|
||||
## Integration Tests
|
||||
|
||||
### Test 4: End-to-End React Project Creation
|
||||
**Input:** "Create a React Hello World app"
|
||||
|
||||
**Expected Flow:**
|
||||
1. TOOL: bash, ARGUMENTS: {"command": "mkdir myapp"}
|
||||
2. TOOL: write, ARGUMENTS: {"filePath": "myapp/package.json", "content": "..."}
|
||||
3. TOOL: write, ARGUMENTS: {"filePath": "myapp/src/App.js", "content": "..."}
|
||||
4. Continue until complete
|
||||
|
||||
**Failure Modes:**
|
||||
- [ ] Model describes steps instead of executing
|
||||
- [ ] Uses npx create-react-app (should manually create files)
|
||||
- [ ] Stops after README only
|
||||
|
||||
### Test 5: Token Reporting Accuracy
|
||||
**Input:** Any chat completion request
|
||||
|
||||
**Expected:**
|
||||
- usage.prompt_tokens matches actual tokens
|
||||
- usage.completion_tokens matches actual tokens
|
||||
- usage.total_tokens is sum
|
||||
|
||||
**Verification:**
|
||||
- Compare tiktoken count vs API response
|
||||
|
||||
## Manual Verification
|
||||
|
||||
```bash
|
||||
# Test React creation
|
||||
python main.py --auto &
|
||||
curl -X POST http://localhost:17615/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Client-Working-Dir: /tmp/test-project" \
|
||||
-d '{
|
||||
"model": "local-swarm",
|
||||
"messages": [{"role": "user", "content": "Create a React Hello World app"}],
|
||||
"tools": [{"type": "function", "function": {"name": "bash"}}, {"type": "function", "function": {"name": "write"}}]
|
||||
}'
|
||||
|
||||
# Check token accuracy
|
||||
curl -X POST http://localhost:17615/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "local-swarm",
|
||||
"messages": [{"role": "user", "content": "Hello"}]
|
||||
}' | jq '.usage'
|
||||
```
|
||||
|
||||
## Success Criteria
|
||||
|
||||
1. **Execution:** 100% of requests use TOOL: format (not descriptions)
|
||||
2. **Accuracy:** Token counts match tiktoken within ±5%
|
||||
3. **Completion:** Multi-file projects fully created via write tool
|
||||
4. **No npx:** Manual file creation for React (no npx create-react-app)
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
### Token Counting Fix
|
||||
```python
# Replace: prompt_tokens = len(prompt) // 4
# With:
import tiktoken
encoding = tiktoken.get_encoding('cl100k_base')
prompt_tokens = len(encoding.encode(prompt))
completion_tokens = len(encoding.encode(content))
```
|
||||
|
||||
### Tool Instructions Fix
|
||||
- Add explicit "DO NOT USE npx create-react-app" instruction
|
||||
- Add "EXECUTE IMMEDIATELY" mandate
|
||||
- Show complete React example with manual file creation
|
||||
|
||||
### Non-Interactive Commands
|
||||
- Auto-add --yes to npx commands
|
||||
- Or recommend manual file creation instead
|
||||
@@ -1,97 +0,0 @@
|
||||
# Test Plan: Improved Tool Instructions
|
||||
|
||||
## Problem Statement
|
||||
Model is not using tools effectively:
|
||||
1. Creates README instead of actual project structure
|
||||
2. Provides commands as text instead of executing them
|
||||
3. Refuses to run commands claiming "I am only an AI assistant"
|
||||
|
||||
## Root Cause Analysis
|
||||
Current instructions don't clearly communicate:
|
||||
- That the model SHOULD use tools proactively
|
||||
- That execution is expected, not explanation
|
||||
- The workflow: user request → tool execution → result
|
||||
|
||||
## Unit Tests (Instruction Verification)
|
||||
|
||||
### Test 1: Instruction Presence
|
||||
- [ ] Verify instructions are injected into system message
|
||||
- [ ] Verify instructions appear at the START of system message (priority position)
|
||||
|
||||
### Test 2: Token Count
|
||||
- [ ] Measure total token count of new instructions
|
||||
- [ ] Verify ≤ 500 tokens (conservative budget)
|
||||
- [ ] Document before/after
|
||||
|
||||
### Test 3: Format Compliance
|
||||
- [ ] Verify instructions include TOOL:/ARGUMENTS: format
|
||||
- [ ] Verify examples use correct format
|
||||
- [ ] Verify rules are clear and numbered
|
||||
|
||||
## Integration Tests (Behavioral)
|
||||
|
||||
### Test 4: Project Creation Flow
|
||||
**Input:** "Create a React Hello World app"
|
||||
|
||||
**Expected Behavior:**
|
||||
1. Model responds with TOOL: bash, ARGUMENTS: mkdir myapp
|
||||
2. After result, TOOL: write, ARGUMENTS: package.json content
|
||||
3. After result, TOOL: write, ARGUMENTS: src/App.js content
|
||||
4. Continue until complete project structure exists
|
||||
|
||||
**Failure Modes:**
|
||||
- [ ] Model only describes what to do
|
||||
- [ ] Model creates README only
|
||||
- [ ] Model refuses to execute commands
|
||||
|
||||
### Test 5: Multi-step Task
|
||||
**Input:** "Check what files exist, then create a test.txt file with 'hello' in it"
|
||||
|
||||
**Expected Behavior:**
|
||||
1. TOOL: bash, ARGUMENTS: ls -la
|
||||
2. Wait for result
|
||||
3. TOOL: write, ARGUMENTS: test.txt with "hello"
|
||||
|
||||
**Failure Modes:**
|
||||
- [ ] Model tries to do both in one response
|
||||
- [ ] Model doesn't wait for ls result before writing
|
||||
|
||||
### Test 6: Command Refusal
|
||||
**Input:** "Run npm install"
|
||||
|
||||
**Expected Behavior:**
|
||||
1. TOOL: bash, ARGUMENTS: npm install
|
||||
|
||||
**Failure Modes:**
|
||||
- [ ] Model responds: "I cannot run commands, I am only an AI assistant"
|
||||
- [ ] Model explains npm install instead of running it
|
||||
|
||||
## Manual Verification Commands
|
||||
|
||||
```bash
|
||||
# Start the server
|
||||
python main.py --auto
|
||||
|
||||
# In another terminal, test with curl
|
||||
curl -X POST http://localhost:17615/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "local-swarm",
|
||||
"messages": [{"role": "user", "content": "Create a React Hello World app"}],
|
||||
"tools": [{"type": "function", "function": {"name": "bash", "description": "Run shell commands"}}, {"type": "function", "function": {"name": "write", "description": "Write files"}}]
|
||||
}'
|
||||
```
|
||||
|
||||
## Success Criteria
|
||||
|
||||
1. **Proactivity:** Model uses tools without being asked twice
|
||||
2. **Execution:** Model runs commands, doesn't just describe them
|
||||
3. **No Refusal:** Model never says "I cannot" or "I am only an AI"
|
||||
4. **Completeness:** Multi-file projects are fully created via tools
|
||||
5. **Format:** 100% of tool calls use correct TOOL:/ARGUMENTS: format
|
||||
|
||||
## Metrics
|
||||
|
||||
- **Tool usage rate:** % of requests that result in tool calls
|
||||
- **Format compliance:** % of tool calls in correct format
|
||||
- **Completion rate:** % of multi-step tasks fully completed
|
||||
@@ -1,35 +0,0 @@
|
||||
# Test Plan: Tool Parsing Simplification
|
||||
|
||||
## Unit Tests
|
||||
|
||||
- [x] Test case 1: Single tool call → Returns 1 tool with correct name and arguments
|
||||
- [x] Test case 2: No tool in text → Returns None for tools, original text as content (see the sketch after this list)
|
||||
- [x] Test case 3: Multiple tools → Returns all tools in order
|
||||
- [x] Test case 4: Content before tool → Content extracted, tool parsed correctly
|
||||
- [x] Test case 5: Bash tool → Correctly parses bash command
|
||||
- [x] Test case 6: Case insensitive → "tool:" and "TOOL:" both work
|
||||
- [x] Test case 7: Invalid JSON → Skips invalid, continues with valid
|
||||
- [x] Test case 8: Empty text → Returns None, empty string
|
||||
- [x] Test case 9: Whitespace only → Returns None
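
For reference, test case 2 might be written roughly like this (import path as used elsewhere in the codebase; adjust to the actual test layout):

```python
from api.tool_parser import parse_tool_calls

def test_no_tool_in_text():
    # Test case 2: plain text with no TOOL:/ARGUMENTS: block
    text = "Just a normal answer with no tool call."
    content, tool_calls = parse_tool_calls(text)
    assert tool_calls is None
    assert content == text
```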
|
||||
|
||||
## Integration Tests
|
||||
|
||||
- [ ] End-to-end flow:
|
||||
1. Send chat completion request with tools
|
||||
2. Model responds with TOOL:/ARGUMENTS: format
|
||||
3. Parser extracts tool call
|
||||
4. Tool executes
|
||||
5. Result returned in response
|
||||
|
||||
- [ ] Expected result: Tool executes successfully, result included in response
|
||||
|
||||
## Manual Verification
|
||||
|
||||
- [ ] Command: `python tests/test_tool_parsing.py`
|
||||
- [ ] Expected output: "9 passed, 0 failed"
|
||||
|
||||
## Token Budget Verification
|
||||
|
||||
- Parser code: ~30 lines (~200 tokens)
|
||||
- Well under 2000 token limit
|
||||
- Simple regex pattern maintains low complexity
|
||||
@@ -10,218 +10,63 @@ import sys
|
||||
import multiprocessing as mp
|
||||
|
||||
# CRITICAL: Set spawn method BEFORE any other imports on macOS
|
||||
# This prevents fork-related issues with Metal GPU
|
||||
if sys.platform == "darwin":
|
||||
try:
|
||||
mp.set_start_method("spawn", force=True)
|
||||
except RuntimeError:
|
||||
pass # Already set
|
||||
pass
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path - resolve for Windows compatibility
|
||||
# Add src to path
|
||||
src_path = Path(__file__).parent.resolve() / "src"
|
||||
sys.path.insert(0, str(src_path))
|
||||
|
||||
# Also add parent dir for Windows import issues
|
||||
if str(Path(__file__).parent.resolve()) not in sys.path:
|
||||
sys.path.insert(0, str(Path(__file__).parent.resolve()))
|
||||
|
||||
# These imports must come AFTER setting spawn method on macOS
|
||||
from hardware.detector import detect_hardware
|
||||
from models.selector import select_optimal_model
|
||||
from models.downloader import download_model_for_config
|
||||
from swarm import SwarmManager
|
||||
from api import create_server
|
||||
from api.routes import set_federated_swarm
|
||||
from mcp_server import create_mcp_server
|
||||
from interactive import (
|
||||
interactive_model_selection,
|
||||
show_startup_summary,
|
||||
show_runtime_menu,
|
||||
custom_configuration,
|
||||
)
|
||||
from network import create_discovery_service, FederatedSwarm
|
||||
from tools.executor import ToolExecutor, set_tool_executor
|
||||
from cli.parser import parse_args
|
||||
from cli.tool_server import run_tool_server
|
||||
from utils.network import get_local_ip
|
||||
from utils.logging_config import setup_logging
|
||||
from hardware.detector import detect_hardware
|
||||
from interactive import print_hardware_info
|
||||
|
||||
# Set up logging (DEBUG level for development)
|
||||
# Set up logging
|
||||
setup_logging()
|
||||
|
||||
|
||||
async def setup_swarm(model_config, hardware):
|
||||
"""Download model and initialize swarm."""
|
||||
# Download model
|
||||
print("\n⬇️ Downloading model...")
|
||||
try:
|
||||
model_path = download_model_for_config(model_config)
|
||||
print(f"✓ Model ready at: {model_path}")
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error downloading model: {e}", file=sys.stderr)
|
||||
return None
|
||||
def handle_detect_mode(hardware) -> int:
|
||||
"""Handle --detect mode."""
|
||||
print_hardware_info(hardware)
|
||||
print("\n✅ Detection complete")
|
||||
return 0
|
||||
|
||||
|
||||
def handle_tool_server_mode(args, hardware) -> int:
|
||||
"""Handle --tool-server mode."""
|
||||
print("\n🔧 Starting Tool Execution Server...")
|
||||
host = args.host if args.host else get_local_ip()
|
||||
|
||||
# Initialize swarm
|
||||
print("\n🚀 Initializing swarm...")
|
||||
try:
|
||||
swarm = SwarmManager(
|
||||
model_config=model_config,
|
||||
hardware=hardware,
|
||||
consensus_strategy="similarity"
|
||||
)
|
||||
|
||||
success = await swarm.initialize(str(model_path))
|
||||
if not success:
|
||||
print("❌ Failed to initialize swarm")
|
||||
return None
|
||||
|
||||
return swarm
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error initializing swarm: {e}", file=sys.stderr)
|
||||
return None
|
||||
asyncio.run(run_tool_server(host, args.tool_port))
|
||||
return 0
|
||||
except KeyboardInterrupt:
|
||||
print("\n\nTool server stopped")
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
def get_local_ip():
|
||||
"""Get the local network IP address (private networks only)."""
|
||||
import socket
|
||||
try:
|
||||
# Create a socket and connect to a public DNS server
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||
s.settimeout(2)
|
||||
# Try to connect to Google's DNS - this doesn't actually send data
|
||||
s.connect(("8.8.8.8", 80))
|
||||
ip = s.getsockname()[0]
|
||||
s.close()
|
||||
|
||||
# Check if it's a private IP (only 192.168.x.x for this network)
|
||||
is_private = (
|
||||
ip.startswith('192.168.')
|
||||
)
|
||||
|
||||
if is_private:
|
||||
print(f" 📡 Detected local IP: {ip}")
|
||||
return ip
|
||||
else:
|
||||
# If not private, return localhost for safety
|
||||
print(f" ⚠️ IP {ip} is not a private network, binding to localhost")
|
||||
return "127.0.0.1"
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Could not detect local IP: {e}, using localhost")
|
||||
return "127.0.0.1"
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Local Swarm - AI-powered coding LLM swarm",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
python main.py # Interactive setup and start
|
||||
python main.py --auto # Auto-detect and start without menu
|
||||
python main.py --detect # Show hardware detection only
|
||||
python main.py --model qwen:3b:q4 # Use specific model (skip menu)
|
||||
python main.py --port 17615 # Use custom port (default: 17615)
|
||||
python main.py --host 192.168.1.5 # Bind to specific IP
|
||||
python main.py --instances 4 # Force number of instances
|
||||
python main.py --download-only # Download model only
|
||||
python main.py --test # Test with sample prompt
|
||||
python main.py --mcp # Enable MCP server
|
||||
python main.py --federation # Enable federation with other instances
|
||||
python main.py --federation --peer 192.168.1.10:17615 # Manual peer
|
||||
"""
|
||||
)
|
||||
async def run_main_mode(args, hardware) -> int:
|
||||
"""Run the main application mode."""
|
||||
from cli.main_runner import MainRunner
|
||||
|
||||
parser.add_argument(
|
||||
"--auto",
|
||||
action="store_true",
|
||||
help="Auto-detect best configuration without interactive menu"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--detect",
|
||||
action="store_true",
|
||||
help="Show hardware detection and exit"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
help="Model to use (format: name:size:quant, e.g., qwen:3b:q4)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--port",
|
||||
type=int,
|
||||
default=17615,
|
||||
help="Port to run the API server on (default: 17615)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--instances",
|
||||
type=int,
|
||||
help="Force number of instances (overrides auto-calculation)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--download-only",
|
||||
action="store_true",
|
||||
help="Download models only, don't start server"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
action="store_true",
|
||||
help="Test with a sample prompt"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mcp",
|
||||
action="store_true",
|
||||
help="Enable MCP server alongside HTTP API"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
default="config.yaml",
|
||||
help="Path to config file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--host",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Host IP to bind to (default: auto-detect)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--federation",
|
||||
action="store_true",
|
||||
help="Enable federation with other Local Swarm instances on the network"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--peer",
|
||||
action="append",
|
||||
dest="peers",
|
||||
help="Manually add a peer (format: host:port, can be used multiple times)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tool-server",
|
||||
action="store_true",
|
||||
help="Run as dedicated tool execution server (executes read/write/bash tools)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tool-port",
|
||||
type=int,
|
||||
default=17616,
|
||||
help="Port for tool execution server (default: 17616)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tool-host",
|
||||
type=str,
|
||||
default=None,
|
||||
nargs='?',
|
||||
const='', # When --tool-host is used without a value, use empty string
|
||||
help="URL of tool execution server. Use without value for auto-detected local IP (http://<local-ip>:17616), or provide explicit URL."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="%(prog)s 0.1.0"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
runner = MainRunner(hardware, args)
|
||||
return await runner.run()
|
||||
|
||||
|
||||
def main() -> int:
|
||||
"""Main entry point."""
|
||||
args = parse_args()
|
||||
|
||||
# Detect hardware first
|
||||
print("\n🔍 Detecting hardware...")
|
||||
@@ -229,316 +74,26 @@ Examples:
|
||||
hardware = detect_hardware()
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error detecting hardware: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
return 1
|
||||
|
||||
# Handle detect mode
|
||||
if args.detect:
|
||||
# Just show hardware info
|
||||
from interactive import print_hardware_info
|
||||
print_hardware_info(hardware)
|
||||
print("\n✅ Detection complete")
|
||||
return
|
||||
return handle_detect_mode(hardware)
|
||||
|
||||
# Tool server mode - run minimal tool-only server
|
||||
# Handle tool server mode
|
||||
if args.tool_server:
|
||||
print("\n🔧 Starting Tool Execution Server...")
|
||||
from fastapi import FastAPI
|
||||
import uvicorn
|
||||
|
||||
# Initialize local tool executor
|
||||
tool_executor = ToolExecutor(tool_host_url=None)
|
||||
set_tool_executor(tool_executor)
|
||||
|
||||
app = FastAPI(title="Local Swarm Tool Server")
|
||||
|
||||
@app.post("/v1/tools/execute")
|
||||
async def execute_tool(request: dict):
|
||||
tool_name = request.get("tool", "")
|
||||
tool_args = request.get("arguments", {})
|
||||
result = await tool_executor.execute(tool_name, tool_args)
|
||||
return {"result": result}
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {"status": "healthy", "mode": "tool-server"}
|
||||
|
||||
host = args.host if args.host else get_local_ip()
|
||||
tool_port = args.tool_port
|
||||
print(f"🔗 Tool server running at http://{host}:{tool_port}")
|
||||
print(f" Endpoints:")
|
||||
print(f" - POST /v1/tools/execute")
|
||||
print(f" - GET /health")
|
||||
print(f"\n✅ Tool server ready!")
|
||||
|
||||
uvicorn.run(app, host=host, port=tool_port)
|
||||
return
|
||||
return handle_tool_server_mode(args, hardware)
|
||||
|
||||
# Determine model configuration
|
||||
config = None
|
||||
|
||||
if args.model or args.instances or args.auto:
|
||||
# Use command-line arguments or auto-detect
|
||||
print("\n📊 Calculating optimal configuration...")
|
||||
try:
|
||||
config = select_optimal_model(
|
||||
hardware,
|
||||
preferred_model=args.model,
|
||||
force_instances=args.instances
|
||||
)
|
||||
|
||||
if not config:
|
||||
print("\n❌ No suitable model found for your hardware")
|
||||
print(" Minimum requirement: 2 GB available memory")
|
||||
sys.exit(1)
|
||||
|
||||
# Show brief summary
|
||||
print(f"\n✓ Selected: {config.display_name}")
|
||||
print(f" Instances: {config.instances}")
|
||||
print(f" Memory: {config.total_memory_gb:.1f} GB")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error selecting model: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
else:
|
||||
# Interactive mode - show menu
|
||||
config = interactive_model_selection(hardware)
|
||||
|
||||
if not config:
|
||||
print("\n❌ No configuration selected")
|
||||
sys.exit(1)
|
||||
|
||||
if args.download_only:
|
||||
# Download model only
|
||||
print("\n" + "=" * 70)
|
||||
print("⬇️ Download Mode: Downloading model only")
|
||||
print("=" * 70)
|
||||
|
||||
try:
|
||||
model_path = download_model_for_config(config)
|
||||
print(f"✓ Model downloaded to: {model_path}")
|
||||
print("\n" + "=" * 70)
|
||||
print("✅ Download complete")
|
||||
print("=" * 70)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Download failed: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
elif args.test:
|
||||
# Test mode with sample prompt
|
||||
print("\n" + "=" * 70)
|
||||
print("🧪 Test Mode: Running sample inference")
|
||||
print("=" * 70)
|
||||
|
||||
async def test_inference():
|
||||
show_startup_summary(hardware, config)
|
||||
swarm = await setup_swarm(config, hardware)
|
||||
if not swarm:
|
||||
return False
|
||||
|
||||
try:
|
||||
# Test prompt
|
||||
prompt = "Write a Python function to calculate factorial:"
|
||||
print(f"\nPrompt: {prompt}\n")
|
||||
print("Generating responses...\n")
|
||||
|
||||
result = await swarm.generate(prompt, max_tokens=200)
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("SELECTED RESPONSE:")
|
||||
print("=" * 70)
|
||||
print(result.selected_response.text)
|
||||
print("\n" + "=" * 70)
|
||||
print(f"Strategy: {result.strategy}")
|
||||
print(f"Confidence: {result.confidence:.2f}")
|
||||
print(f"Latency: {result.selected_response.latency_ms:.1f}ms")
|
||||
print(f"Tokens/sec: {result.selected_response.tokens_per_second:.1f}")
|
||||
|
||||
# Show all responses
|
||||
print("\nAll responses received:")
|
||||
for i, resp in enumerate(result.all_responses):
|
||||
preview = resp.text[:60].replace('\n', ' ')
|
||||
print(f" Worker {i}: {preview}... ({resp.latency_ms:.1f}ms)")
|
||||
|
||||
return True
|
||||
finally:
|
||||
await swarm.shutdown()
|
||||
|
||||
success = asyncio.run(test_inference())
|
||||
|
||||
if success:
|
||||
print("\n" + "=" * 70)
|
||||
print("✅ Test complete")
|
||||
print("=" * 70)
|
||||
else:
|
||||
print("\n❌ Test failed")
|
||||
sys.exit(1)
|
||||
|
||||
else:
|
||||
# Full mode (download + start API server + optional MCP)
|
||||
show_startup_summary(hardware, config)
|
||||
|
||||
async def run_server():
|
||||
swarm = await setup_swarm(config, hardware)
|
||||
if not swarm:
|
||||
return False
|
||||
|
||||
# Initialize tool executor
|
||||
if args.tool_host is not None:
|
||||
# --tool-host was provided
|
||||
if args.tool_host == "":
|
||||
# --tool-host with no value - use local IP with default port
|
||||
local_ip = get_local_ip()
|
||||
tool_host_url = f"http://{local_ip}:17616"
|
||||
print(f"\n🔧 Using remote tool host: {tool_host_url} (auto-detected local IP)")
|
||||
else:
|
||||
# --tool-host with explicit value
|
||||
tool_host_url = args.tool_host
|
||||
print(f"\n🔧 Using remote tool host: {tool_host_url}")
|
||||
tool_executor = ToolExecutor(tool_host_url=tool_host_url)
|
||||
set_tool_executor(tool_executor)
|
||||
else:
|
||||
# Local tool execution (default)
|
||||
tool_executor = ToolExecutor(tool_host_url=None)
|
||||
set_tool_executor(tool_executor)
|
||||
|
||||
# Update summary with runtime info
|
||||
show_startup_summary(hardware, config, swarm)
|
||||
|
||||
# Initialize federation if enabled
|
||||
discovery = None
|
||||
federated_swarm = None
|
||||
if args.federation:
|
||||
print("\n🌐 Initializing federation...")
|
||||
try:
|
||||
# Use specified host for advertising if provided
|
||||
advertise_ip = args.host if args.host else None
|
||||
discovery = await create_discovery_service(args.port, advertise_ip=advertise_ip)
|
||||
|
||||
# Get swarm info for advertising
|
||||
swarm_info = {
|
||||
"version": "0.1.0",
|
||||
"instances": config.instances,
|
||||
"model_id": config.model_id,
|
||||
"hardware_summary": f"{hardware.cpu_cores} CPU, {hardware.ram_gb:.1f}GB RAM"
|
||||
}
|
||||
|
||||
await discovery.start_advertising(swarm_info)
|
||||
await discovery.start_listening()
|
||||
|
||||
# Add manual peers if specified
|
||||
if args.peers:
|
||||
print(f" 📍 Adding {len(args.peers)} manual peer(s)...")
|
||||
from network.discovery import PeerInfo
|
||||
from datetime import datetime
|
||||
for peer_str in args.peers:
|
||||
try:
|
||||
host, port = peer_str.rsplit(':', 1)
|
||||
port = int(port)
|
||||
peer = PeerInfo(
|
||||
host=host,
|
||||
port=port,
|
||||
name=f"manual_{host}_{port}",
|
||||
version="0.1.0",
|
||||
instances=0,
|
||||
model_id="unknown",
|
||||
hardware_summary="manual",
|
||||
last_seen=datetime.now()
|
||||
)
|
||||
discovery.peers[peer.name] = peer
|
||||
print(f" ✓ Added peer: {host}:{port}")
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Failed to add peer {peer_str}: {e}")
|
||||
|
||||
# Create federated swarm wrapper
|
||||
federated_swarm = FederatedSwarm(swarm, discovery)
|
||||
set_federated_swarm(federated_swarm)
|
||||
|
||||
# Start health check loop in background
|
||||
asyncio.create_task(discovery.start_health_check_loop(interval_seconds=10))
|
||||
|
||||
print(f" ✓ Federation enabled")
|
||||
print(f" ✓ Discovery active on port {discovery.discovery_port}")
|
||||
print(f" ✓ Peer health checks every 10s")
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Failed to initialize federation: {e}")
|
||||
print(" Continuing without federation...")
|
||||
|
||||
mcp_server = None
|
||||
try:
|
||||
# Create and start API server
|
||||
print("\n🌐 Starting HTTP API server...")
|
||||
# Use provided host or auto-detect
|
||||
if args.host:
|
||||
host = args.host
|
||||
print(f"🔗 Using specified host: {host}:{args.port}")
|
||||
else:
|
||||
# Use local network IP instead of 0.0.0.0 for security
|
||||
host = get_local_ip()
|
||||
print(f"🔗 Binding to {host}:{args.port}")
|
||||
server = create_server(swarm, host=host, port=args.port)
|
||||
|
||||
print(f"\n✅ Local Swarm is running!")
|
||||
print(f" API: http://{host}:{args.port}/v1")
|
||||
print(f" Health: http://{host}:{args.port}/health")
|
||||
|
||||
if args.federation and discovery:
|
||||
peers = discovery.get_peers()
|
||||
print(f"\n🌐 Federation: Enabled")
|
||||
print(f" Discovery port: {discovery.discovery_port}")
|
||||
if peers:
|
||||
print(f" Peers discovered: {len(peers)}")
|
||||
for peer in peers:
|
||||
print(f" - {peer.name} ({peer.model_id})")
|
||||
else:
|
||||
print(f" Peers discovered: 0 (waiting for peers...)")
|
||||
|
||||
# Show tool server status
|
||||
if args.tool_host is not None:
|
||||
print(f"\n🔧 Tool Server: Remote")
|
||||
if args.tool_host == "":
|
||||
local_ip = get_local_ip()
|
||||
print(f" URL: http://{local_ip}:17616 (auto-detected)")
|
||||
else:
|
||||
print(f" URL: {args.tool_host}")
|
||||
print(f" Mode: Tools executed remotely on tool host")
|
||||
else:
|
||||
print(f"\n🔧 Tool Server: Local")
|
||||
print(f" Mode: Tools executed on this machine")
|
||||
|
||||
if args.mcp:
|
||||
# Start MCP server alongside HTTP API
|
||||
print("\n🤖 Starting MCP server...")
|
||||
mcp_server = await create_mcp_server(swarm)
|
||||
print(" MCP server active (stdio)")
|
||||
|
||||
print(f"\n💡 Configure opencode to use:")
|
||||
print(f' base_url: http://127.0.0.1:{args.port}/v1')
|
||||
print(f' api_key: any (not used)')
|
||||
print(f"\nPress Ctrl+C to stop...\n")
|
||||
|
||||
# Start HTTP server (this will block)
|
||||
await server.start()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n\nReceived stop signal")
|
||||
finally:
|
||||
if federated_swarm:
|
||||
await federated_swarm.close()
|
||||
if discovery:
|
||||
await discovery.stop()
|
||||
await swarm.shutdown()
|
||||
|
||||
return True
|
||||
|
||||
try:
|
||||
success = asyncio.run(run_server())
|
||||
if success:
|
||||
print("\n" + "=" * 70)
|
||||
print("✅ Server stopped gracefully")
|
||||
print("=" * 70)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error running server: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
# Run main mode
|
||||
try:
|
||||
return asyncio.run(run_main_mode(args, hardware))
|
||||
except KeyboardInterrupt:
|
||||
print("\n\nReceived stop signal")
|
||||
return 0
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
sys.exit(main())
|
||||
|
||||
@@ -0,0 +1,760 @@
|
||||
"""Chat completion handlers for Local Swarm.
|
||||
|
||||
Contains the business logic for chat completions, separated from HTTP routing.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from typing import Optional, List
|
||||
|
||||
from api.models import (
|
||||
ChatCompletionRequest,
|
||||
ChatCompletionResponse,
|
||||
ChatCompletionChoice,
|
||||
ChatMessage,
|
||||
UsageInfo,
|
||||
)
|
||||
from api.formatting import format_messages_with_tools
|
||||
from api.tool_parser import parse_tool_calls
|
||||
from utils.token_counter import count_tokens
|
||||
from tools.executor import get_tool_executor
|
||||
from chatlog import get_chat_logger
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_working_dir_from_prompt(prompt: str) -> Optional[str]:
|
||||
"""Extract working directory from user prompt.
|
||||
|
||||
Looks for patterns like:
|
||||
- "in the /path/to/dir directory"
|
||||
- "in directory /path/to/dir"
|
||||
- "in /path/to/dir"
|
||||
- "under /path/to/dir"
|
||||
- "from /path/to/dir"
|
||||
|
||||
Args:
|
||||
prompt: User prompt text
|
||||
|
||||
Returns:
|
||||
Extracted directory path or None
|
||||
"""
|
||||
import re
|
||||
import os
|
||||
|
||||
# Common patterns for directory mentions
|
||||
patterns = [
|
||||
r'in the\s+([/~]?[\w\-/.]+)\s+(?:directory|folder|dir)',
|
||||
r'in\s+(?:directory|folder|dir)\s+([/~]?[\w\-/.]+)',
|
||||
r'(?:in|under|from|at)\s+([/~]?[\w\-/.]{3,})', # At least 3 chars to avoid "in a"
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, prompt, re.IGNORECASE)
|
||||
if match:
|
||||
path = match.group(1)
|
||||
# Validate it looks like a path
|
||||
if path.startswith('/') or path.startswith('~') or '/' in path:
|
||||
# Expand home directory
|
||||
if path.startswith('~'):
|
||||
path = os.path.expanduser(path)
|
||||
# Check if it's a valid directory or parent exists
|
||||
if os.path.isdir(path) or os.path.isdir(os.path.dirname(path)):
|
||||
return os.path.abspath(path)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _sanitize_tools(tools: Optional[list]) -> Optional[list]:
|
||||
"""Sanitize tool definitions to fix invalid schemas.
|
||||
|
||||
Removes extra 'description' from properties if present.
|
||||
|
||||
Args:
|
||||
tools: List of tool definitions
|
||||
|
||||
Returns:
|
||||
Sanitized tools list
|
||||
"""
|
||||
if not tools:
|
||||
return tools
|
||||
|
||||
sanitized = []
|
||||
for tool in tools:
|
||||
if tool.type == "function" and tool.function.parameters:
|
||||
params = tool.function.parameters
|
||||
# Remove invalid 'description' from properties if present
|
||||
if 'properties' in params and 'description' in params.get('properties', {}):
|
||||
invalid_props = ['description']
|
||||
# Also remove 'description' from required if present
|
||||
if 'required' in params:
|
||||
params['required'] = [r for r in params.get('required', []) if r not in invalid_props]
|
||||
# Remove invalid properties
|
||||
params['properties'] = {k: v for k, v in params.get('properties', {}).items() if k not in invalid_props}
|
||||
logger.debug(f" 🔧 Sanitized tool '{tool.function.name}': removed {invalid_props} from properties/required")
|
||||
sanitized.append(tool)
|
||||
|
||||
return sanitized
|
||||
|
||||
|
||||
async def _execute_tools(
|
||||
tool_calls: list,
|
||||
client_working_dir: Optional[str],
|
||||
executor
|
||||
) -> List[tuple]:
|
||||
"""Execute tool calls and return results.
|
||||
|
||||
Args:
|
||||
tool_calls: List of parsed tool calls
|
||||
client_working_dir: Working directory for file operations
|
||||
executor: Tool executor instance
|
||||
|
||||
Returns:
|
||||
List of tuples (tool_name, result_string)
|
||||
"""
|
||||
from api.routes import execute_tool_server_side
|
||||
|
||||
tool_results = []
|
||||
for i, tc in enumerate(tool_calls):
|
||||
tool_name = tc.get("function", {}).get("name", "")
|
||||
tool_args_str = tc.get("function", {}).get("arguments", "{}")
|
||||
try:
|
||||
tool_args = json.loads(tool_args_str) if isinstance(tool_args_str, str) else tool_args_str
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
tool_args = {}
|
||||
|
||||
logger.debug(f" [{i+1}/{len(tool_calls)}] Executing: {tool_name}({tool_args})")
|
||||
result = await execute_tool_server_side(tool_name, tool_args, working_dir=client_working_dir)
|
||||
tool_results.append((tool_name, result))
|
||||
logger.debug(f" ✓ Completed: {result[:100]}..." if len(result) > 100 else f" ✓ Result: {result}")
|
||||
|
||||
return tool_results
|
||||
|
||||
|
||||
def _create_response(
|
||||
content: str,
|
||||
tool_calls: list,
|
||||
finish_reason: str,
|
||||
prompt: str,
|
||||
request: ChatCompletionRequest,
|
||||
swarm_manager=None,
|
||||
thinking_content: Optional[str] = None
|
||||
) -> ChatCompletionResponse:
|
||||
"""Create a chat completion response.
|
||||
|
||||
Args:
|
||||
content: Final response content (after tool execution if any)
|
||||
tool_calls: List of tool calls
|
||||
finish_reason: Finish reason
|
||||
prompt: Original prompt for token counting
|
||||
request: Original request
|
||||
swarm_manager: Swarm manager instance (optional, for getting model name)
|
||||
thinking_content: Intermediate thinking/planning content to include in streaming as reasoning_content
|
||||
|
||||
Returns:
|
||||
ChatCompletionResponse
|
||||
"""
|
||||
"""Create a chat completion response.
|
||||
|
||||
Args:
|
||||
content: Response content
|
||||
tool_calls: List of tool calls
|
||||
finish_reason: Finish reason
|
||||
prompt: Original prompt for token counting
|
||||
request: Original request
|
||||
swarm_manager: Swarm manager instance (optional, for getting model name)
|
||||
|
||||
Returns:
|
||||
ChatCompletionResponse
|
||||
"""
|
||||
# Ensure content is at least an empty string (never None for OpenAI compatibility)
|
||||
if content is None:
|
||||
content = ""
|
||||
|
||||
prompt_tokens = count_tokens(prompt)
|
||||
completion_tokens = count_tokens(content)
|
||||
total_tokens = prompt_tokens + completion_tokens
|
||||
|
||||
# Get actual model name from swarm manager
|
||||
model_name = request.model
|
||||
system_fingerprint = None
|
||||
if swarm_manager:
|
||||
status = swarm_manager.get_status()
|
||||
model_name = status.model_name
|
||||
# Sanitize system_fingerprint to only include safe characters
|
||||
import re
|
||||
raw_fingerprint = model_name.lower().replace(" ", "-")
|
||||
system_fingerprint = re.sub(r'[^a-z0-9\-_]', '', raw_fingerprint)
|
||||
|
||||
# Build message - omit tool_calls entirely if empty (OpenAI behavior)
|
||||
message_kwargs = {
|
||||
"role": "assistant",
|
||||
"content": content
|
||||
}
|
||||
if tool_calls:
|
||||
message_kwargs["tool_calls"] = tool_calls
|
||||
|
||||
message = ChatMessage(**message_kwargs)
|
||||
|
||||
response = ChatCompletionResponse(
|
||||
id=f"chatcmpl-{uuid.uuid4().hex[:12]}",
|
||||
created=int(time.time()),
|
||||
model=model_name,
|
||||
choices=[
|
||||
ChatCompletionChoice(
|
||||
index=0,
|
||||
message=message,
|
||||
logprobs=None,
|
||||
finish_reason=finish_reason
|
||||
)
|
||||
],
|
||||
usage=UsageInfo(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=total_tokens
|
||||
),
|
||||
stats={},
|
||||
system_fingerprint=system_fingerprint
|
||||
)
|
||||
|
||||
# Attach thinking content for streaming (not part of JSON serialization)
|
||||
# Use a private attribute to avoid interfering with model serialization
|
||||
if thinking_content is not None:
|
||||
setattr(response, '_thinking', thinking_content)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
async def _generate_with_consensus(
|
||||
prompt: str,
|
||||
max_tokens: int,
|
||||
temperature: float,
|
||||
swarm_manager,
|
||||
federated_swarm=None
|
||||
) -> tuple[str, int, float]:
|
||||
"""Generate response with consensus (local or federated).
|
||||
|
||||
This is the unified generation interface - it handles both local-only
|
||||
and federated generation transparently. Callers don't need to know
|
||||
which mode is being used.
|
||||
|
||||
Args:
|
||||
prompt: Prompt to generate from
|
||||
max_tokens: Maximum tokens to generate
|
||||
temperature: Sampling temperature
|
||||
swarm_manager: Local swarm manager instance
|
||||
federated_swarm: Optional federated swarm for multi-node consensus
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, tokens_generated, tokens_per_second)
|
||||
"""
|
||||
# Check if federation is available
|
||||
if federated_swarm is not None:
|
||||
peers = federated_swarm.discovery.get_peers()
|
||||
if peers:
|
||||
logger.debug(f"🌐 Using federation with {len(peers)} peer(s)")
|
||||
try:
|
||||
fed_result = await federated_swarm.generate_with_federation(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature
|
||||
)
|
||||
# Federation returns FederationResult object
|
||||
# Extract the final response text
|
||||
return fed_result.final_response, 0, 0.0 # Tokens/TPS not tracked in federation mode
|
||||
except Exception as e:
|
||||
logger.warning(f"Federation failed, falling back to local: {e}")
|
||||
# Fall through to local generation
|
||||
|
||||
# Local generation (fallback or no federation)
|
||||
try:
|
||||
result = await swarm_manager.generate(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
use_consensus=True
|
||||
)
|
||||
response = result.selected_response
|
||||
return response.text, response.tokens_generated, response.tokens_per_second
|
||||
except Exception as e:
|
||||
logger.exception("Error in swarm generation")
|
||||
raise
|
||||
|
||||
|
||||
def _tool_calls_agree(tool_calls_list: List[List[dict]]) -> bool:
|
||||
"""Check if all workers agree on the same tool calls.
|
||||
|
||||
Args:
|
||||
tool_calls_list: List of tool calls from each worker
|
||||
|
||||
Returns:
|
||||
True if all workers have the same tool calls
|
||||
"""
|
||||
if not tool_calls_list:
|
||||
return True
|
||||
|
||||
# Check if all have the same number of tool calls
|
||||
first_count = len(tool_calls_list[0])
|
||||
if not all(len(tc) == first_count for tc in tool_calls_list):
|
||||
logger.warning(f" ⚠️ Workers disagree on number of tool calls: {[len(tc) for tc in tool_calls_list]}")
|
||||
return False
|
||||
|
||||
if first_count == 0:
|
||||
return True # All agree on no tools
|
||||
|
||||
# Check if tool names and arguments match
|
||||
for i in range(first_count):
|
||||
first_tool = tool_calls_list[0][i]
|
||||
first_name = first_tool.get("function", {}).get("name", "")
|
||||
first_args = first_tool.get("function", {}).get("arguments", "")
|
||||
|
||||
for j, other_calls in enumerate(tool_calls_list[1:], 1):
|
||||
other_tool = other_calls[i]
|
||||
other_name = other_tool.get("function", {}).get("name", "")
|
||||
other_args = other_tool.get("function", {}).get("arguments", "")
|
||||
|
||||
if first_name != other_name:
|
||||
logger.warning(f" ⚠️ Worker {j+1} disagrees on tool name: {first_name} vs {other_name}")
|
||||
return False
|
||||
|
||||
# For arguments, do a loose comparison (ignore whitespace differences)
|
||||
try:
|
||||
first_args_norm = json.loads(first_args) if isinstance(first_args, str) else first_args
|
||||
other_args_norm = json.loads(other_args) if isinstance(other_args, str) else other_args
|
||||
if first_args_norm != other_args_norm:
|
||||
logger.warning(f" ⚠️ Worker {j+1} disagrees on arguments for {first_name}")
|
||||
return False
|
||||
except json.JSONDecodeError:
|
||||
# If JSON parsing fails, compare as strings
|
||||
if str(first_args).strip() != str(other_args).strip():
|
||||
logger.warning(f" ⚠️ Worker {j+1} disagrees on arguments for {first_name}")
|
||||
return False
|
||||
|
||||
logger.info(f" ✅ All {len(tool_calls_list)} workers agree on tool calls")
|
||||
return True
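
For illustration, a minimal sketch of how `_tool_calls_agree` behaves for agreeing and disagreeing workers; the tool-call dictionaries below are hypothetical examples, not output from a real run:

```python
# Hypothetical tool-call dicts in the OpenAI function-call shape used above
call_a = {"function": {"name": "bash", "arguments": '{"command": "ls"}'}}
call_b = {"function": {"name": "bash", "arguments": '{ "command" : "ls" }'}}  # same args, different whitespace
call_c = {"function": {"name": "read", "arguments": '{"filePath": "notes.md"}'}}

_tool_calls_agree([[call_a], [call_b]])  # True  - arguments compared after JSON normalization
_tool_calls_agree([[call_a], [call_c]])  # False - workers disagree on the tool name
_tool_calls_agree([[], []])              # True  - everyone agrees on "no tools"
```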
|
||||
|
||||
|
||||
async def _generate_with_tool_consensus(
|
||||
swarm_manager,
|
||||
prompt: str,
|
||||
max_tokens: int,
|
||||
temperature: float
|
||||
) -> tuple[str, List[dict], int, float]:
|
||||
"""Generate response with tool call consensus checking.
|
||||
|
||||
When multiple workers are active, this ensures they all agree on tool calls
|
||||
before executing them. If they disagree, returns the best response without tools.
|
||||
|
||||
Args:
|
||||
swarm_manager: Swarm manager instance
|
||||
prompt: Prompt to generate from
|
||||
max_tokens: Maximum tokens to generate
|
||||
temperature: Sampling temperature
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, tool_calls, tokens_generated, tps)
|
||||
"""
|
||||
try:
|
||||
# Get status to check number of workers
|
||||
status = swarm_manager.get_status()
|
||||
num_workers = getattr(status, 'active_workers', 1)
|
||||
|
||||
# If only one worker, use normal generation
|
||||
if num_workers <= 1:
|
||||
logger.debug(" Single worker mode - skipping tool consensus")
|
||||
result = await swarm_manager.generate(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
use_consensus=True
|
||||
)
|
||||
response = result.selected_response
|
||||
parsed_content, tool_calls = parse_tool_calls(response.text)
|
||||
return response.text, tool_calls, response.tokens_generated, response.tokens_per_second
|
||||
|
||||
# Multiple workers - check for tool consensus
|
||||
logger.info(f" 🔍 Checking tool consensus across {num_workers} workers...")
|
||||
|
||||
# Generate from all workers individually
|
||||
from swarm.manager import GenerationRequest
|
||||
all_responses = []
|
||||
all_tool_calls = []
|
||||
|
||||
# Get all active workers
|
||||
workers = swarm_manager.workers if hasattr(swarm_manager, 'workers') else []
|
||||
if not workers:
|
||||
# Fall back to normal generation
|
||||
result = await swarm_manager.generate(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
use_consensus=True
|
||||
)
|
||||
response = result.selected_response
|
||||
parsed_content, tool_calls = parse_tool_calls(response.text)
|
||||
return response.text, tool_calls, response.tokens_generated, response.tokens_per_second
|
||||
|
||||
# Generate from each worker
|
||||
for i, worker in enumerate(workers):
|
||||
try:
|
||||
gen_result = await worker.generate(
|
||||
GenerationRequest(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
|
||||
)
|
||||
response_text = gen_result.text
|
||||
parsed_content, tool_calls = parse_tool_calls(response_text)
|
||||
all_responses.append(response_text)
|
||||
all_tool_calls.append(tool_calls)
|
||||
logger.debug(f" Worker {i+1}: {len(tool_calls)} tool call(s)")
|
||||
except Exception as e:
|
||||
logger.warning(f" Worker {i+1} failed: {e}")
|
||||
all_responses.append("")
|
||||
all_tool_calls.append([])
|
||||
|
||||
# Check consensus
|
||||
if _tool_calls_agree(all_tool_calls):
|
||||
# All agree - use the first response's tool calls
|
||||
best_response = all_responses[0] if all_responses else ""
|
||||
best_tool_calls = all_tool_calls[0] if all_tool_calls else []
|
||||
# Average token estimate across non-empty responses (guarded so an all-failed worker set cannot divide by zero)
non_empty_responses = [r for r in all_responses if r]
total_tokens = sum(len(r.split()) for r in non_empty_responses) // max(len(non_empty_responses), 1)
avg_tps = 10.0 # Estimate
|
||||
return best_response, best_tool_calls, total_tokens, avg_tps
|
||||
else:
|
||||
# Disagreement - fall back to consensus strategy without tools
|
||||
logger.warning(" ⚠️ Tool consensus failed - falling back to text response")
|
||||
result = await swarm_manager.generate(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
use_consensus=True
|
||||
)
|
||||
response = result.selected_response
|
||||
# Strip any tool calls to be safe
|
||||
parsed_content, _ = parse_tool_calls(response.text)
|
||||
return parsed_content, [], response.tokens_generated, response.tokens_per_second
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Error in tool consensus generation")
|
||||
# Fall back to normal generation
|
||||
result = await swarm_manager.generate(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
use_consensus=True
|
||||
)
|
||||
response = result.selected_response
|
||||
parsed_content, tool_calls = parse_tool_calls(response.text)
|
||||
return response.text, tool_calls, response.tokens_generated, response.tokens_per_second
|
||||
|
||||
|
||||
async def _generate_with_federation(
|
||||
federated_swarm,
|
||||
prompt: str,
|
||||
max_tokens: int,
|
||||
temperature: float
|
||||
) -> tuple[str, list, str]:
|
||||
"""Generate response using federated swarm.
|
||||
|
||||
Args:
|
||||
federated_swarm: Federated swarm instance
|
||||
prompt: Prompt to generate from
|
||||
max_tokens: Maximum tokens to generate
|
||||
temperature: Sampling temperature
|
||||
|
||||
Returns:
|
||||
Tuple of (response_text, tool_calls, finish_reason)
|
||||
"""
|
||||
result = await federated_swarm.generate_with_federation(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
min_peers=0
|
||||
)
|
||||
|
||||
content = result.final_response or ""
|
||||
|
||||
# Check for tool calls
|
||||
content_parsed, tool_calls_parsed = parse_tool_calls(content)
|
||||
if tool_calls_parsed:
|
||||
return content_parsed or "", tool_calls_parsed, "tool_calls"
|
||||
|
||||
return content or "", [], "stop"
|
||||
|
||||
|
||||
async def handle_chat_completion(
|
||||
request: ChatCompletionRequest,
|
||||
swarm_manager,
|
||||
federated_swarm,
|
||||
client_working_dir: Optional[str],
|
||||
use_opencode_tools: bool
|
||||
) -> ChatCompletionResponse:
|
||||
"""Handle a chat completion request.
|
||||
|
||||
Args:
|
||||
request: Chat completion request
|
||||
swarm_manager: Swarm manager instance
|
||||
federated_swarm: Optional federated swarm instance
|
||||
client_working_dir: Client working directory
|
||||
use_opencode_tools: Whether to use opencode tool definitions
|
||||
|
||||
Returns:
|
||||
Chat completion response
|
||||
"""
|
||||
# Format messages into prompt
|
||||
if use_opencode_tools:
|
||||
sanitized_tools = _sanitize_tools(request.tools)
|
||||
prompt = format_messages_with_tools(request.messages, sanitized_tools)
|
||||
has_tools = sanitized_tools is not None and len(sanitized_tools) > 0
|
||||
else:
|
||||
prompt = format_messages_with_tools(request.messages, None)
|
||||
has_tools = request.tools is not None and len(request.tools) > 0
|
||||
|
||||
# Initialize chat logger (if enabled via LOCAL_SWARM_CHATLOG=1)
|
||||
chat_logger = get_chat_logger()
|
||||
|
||||
# Extract working directory from prompt if not provided by client
|
||||
if client_working_dir is None:
|
||||
# Try to extract from user messages
|
||||
for msg in reversed(request.messages):
|
||||
if msg.role == 'user':
|
||||
extracted_dir = _extract_working_dir_from_prompt(msg.content)
|
||||
if extracted_dir:
|
||||
client_working_dir = extracted_dir
|
||||
logger.info(f"📁 Extracted working directory from prompt: {client_working_dir}")
|
||||
break
|
||||
|
||||
# Log initial conversation history to chatlog
|
||||
for msg in request.messages:
|
||||
if msg.role == 'user':
|
||||
chat_logger.log_user_message(msg.content)
|
||||
elif msg.role == 'assistant':
|
||||
chat_logger.log_assistant_message(msg.content, has_tool_calls=bool(msg.tool_calls))
|
||||
elif msg.role == 'tool':
|
||||
chat_logger.log_tool_result("tool", msg.content)
|
||||
|
||||
logger.info(f"\n{'='*60}")
|
||||
logger.info(f"CHAT COMPLETION REQUEST:")
|
||||
logger.info(f" has_tools={has_tools}, stream={request.stream}")
|
||||
logger.info(f" use_opencode={use_opencode_tools}")
|
||||
logger.info(f" messages={len(request.messages)}")
|
||||
logger.info(f"{'='*60}")
|
||||
|
||||
# Build conversation history
|
||||
messages = list(request.messages)
|
||||
|
||||
# Determine if we should use federation for generation
|
||||
use_federation = federated_swarm is not None and len(federated_swarm.discovery.get_peers()) > 0
|
||||
if use_federation:
|
||||
logger.info(f"🌐 Federation available with peers")
|
||||
|
||||
# Track thinking content for streaming (OpenCode reasoning_content)
|
||||
thinking_content: Optional[str] = None
|
||||
thinking_captured = False
|
||||
|
||||
# Initialize iteration counter and response text
|
||||
iteration = 0
|
||||
max_iterations = 3
|
||||
response_text = ""
|
||||
|
||||
while iteration < max_iterations:
|
||||
iteration += 1
|
||||
logger.info(f"--- Tool Execution Iteration {iteration} ---")
|
||||
|
||||
# Generate response
|
||||
# IMPORTANT: Only use federation on FIRST iteration (initial planning)
|
||||
# Subsequent iterations process tool results which only head node has
|
||||
if iteration == 1 and use_federation:
|
||||
# First iteration: use federation for consensus on initial plan
|
||||
logger.info(f"🌐 Using federation for initial generation...")
|
||||
response_text, tokens_generated, tps = await _generate_with_consensus(
|
||||
prompt=prompt,
|
||||
max_tokens=request.max_tokens or 1024,
|
||||
temperature=request.temperature or 0.7,
|
||||
swarm_manager=swarm_manager,
|
||||
federated_swarm=federated_swarm
|
||||
)
|
||||
else:
|
||||
# Subsequent iterations: LOCAL ONLY
|
||||
# Peers don't have tool results from previous iterations
|
||||
# Using federation here would cause inconsistent context
|
||||
if iteration > 1:
|
||||
logger.debug(f"Using local generation (iteration {iteration}, tool context local only)")
|
||||
response_text, tokens_generated, tps = await _generate_with_consensus(
|
||||
prompt=prompt,
|
||||
max_tokens=request.max_tokens or 1024,
|
||||
temperature=request.temperature or 0.7,
|
||||
swarm_manager=swarm_manager,
|
||||
federated_swarm=None # Force local-only
|
||||
)
|
||||
|
||||
logger.info(f"Generated response ({len(response_text)} chars, {tokens_generated} tokens)")
|
||||
logger.debug(f"Response: {response_text[:200]}...")
|
||||
|
||||
# Check for tool calls
|
||||
parsed_content, tool_calls_parsed = parse_tool_calls(response_text)
|
||||
|
||||
# Log assistant response to chatlog
|
||||
chat_logger.log_assistant_message(response_text, has_tool_calls=bool(tool_calls_parsed))
|
||||
|
||||
if tool_calls_parsed:
|
||||
# Log each tool call
|
||||
for i, tc in enumerate(tool_calls_parsed, 1):
|
||||
tool_name = tc.get("function", {}).get("name", "")
|
||||
args_str = tc.get("function", {}).get("arguments", "{}")
|
||||
try:
|
||||
args_dict = json.loads(args_str) if isinstance(args_str, str) else args_str
|
||||
except json.JSONDecodeError:
|
||||
args_dict = {"raw": args_str}
|
||||
chat_logger.log_tool_call(tool_name, args_dict, i)
|
||||
|
||||
# Capture thinking for OpenCode streaming (first occurrence only)
|
||||
if not thinking_captured:
|
||||
# Use the parsed content (without tool calls) as the reasoning
|
||||
thinking_content = parsed_content or ""
|
||||
thinking_captured = True
|
||||
|
||||
if not tool_calls_parsed:
|
||||
# No more tools - this is the final answer
|
||||
logger.info(f"✅ Final answer (no tools) after {iteration} iteration(s)")
|
||||
return _create_response(parsed_content, [], "stop", prompt, request, swarm_manager, thinking_content)
|
||||
|
||||
# Tools detected - execute them
|
||||
logger.info(f"🔧 Found {len(tool_calls_parsed)} tool call(s)")
|
||||
for i, tc in enumerate(tool_calls_parsed):
|
||||
tool_name = tc.get("function", {}).get("name", "")
|
||||
args_str = tc.get("function", {}).get("arguments", "{}")
|
||||
logger.info(f" [{i+1}] {tool_name}: {args_str[:100]}...")
|
||||
|
||||
# Add assistant message to history with tool_calls (if any)
|
||||
# This preserves the tool call IDs for proper tool message association
|
||||
assistant_message = ChatMessage(
|
||||
role="assistant",
|
||||
content=response_text
|
||||
)
|
||||
if tool_calls_parsed:
|
||||
# Convert tool calls to proper ToolCall objects with IDs
|
||||
from api.models import ToolCall
|
||||
tc_objects = []
|
||||
for i, tc_dict in enumerate(tool_calls_parsed):
|
||||
tc_id = tc_dict.get("id", f"call_{i}")
|
||||
tc_objects.append(ToolCall(
|
||||
id=tc_id,
|
||||
type="function",
|
||||
function={
|
||||
"name": tc_dict["function"]["name"],
|
||||
"arguments": tc_dict["function"]["arguments"]
|
||||
}
|
||||
))
|
||||
assistant_message.tool_calls = tc_objects
|
||||
|
||||
messages.append(assistant_message)
|
||||
|
||||
# Execute all tools
|
||||
logger.info(f"⏱️ Executing tools...")
|
||||
tool_results = await _execute_tools(tool_calls_parsed, client_working_dir, get_tool_executor())
|
||||
|
||||
# Log tool results to chatlog (single combined log for debugging)
|
||||
combined_strings = [f"Tool {i+1} ({name}): {result}" for i, (name, result) in enumerate(tool_results)]
|
||||
chat_logger.log_tool_result("combined", "\n\n".join(combined_strings), success=True)
|
||||
|
||||
# Add tool result to history - one message per tool call with proper tool_call_id
|
||||
for i, ((tool_name, tool_result), tc) in enumerate(zip(tool_results, tool_calls_parsed)):
|
||||
tool_call_id = tc.get("id", f"call_{i}")
|
||||
|
||||
# Format the tool result message with explicit instructions
|
||||
# This tells the model exactly what to do with the result
|
||||
if tool_name == "read":
|
||||
instruction = "The file contents are shown above. READ THIS FILE CONTENT ALOUD to the user. Do not call additional tools."
|
||||
elif tool_name == "write":
|
||||
instruction = "The file has been successfully written. CONFIRM to the user that the file was created with the content shown above. Do not call additional tools."
|
||||
elif tool_name == "bash":
|
||||
# Check if this was a verification command (ls, grep) vs an action command
|
||||
if "ls" in tool_result.lower() or "grep" in tool_result.lower():
|
||||
instruction = "CRITICAL: The listing is shown above. If the user asked to READ a specific file and you can see it exists in this listing, you MUST immediately USE THE read TOOL NOW with the exact filename from the listing. Do not summarize first - READ THE FILE immediately. Use the filename exactly as shown (e.g., 'my-secret.log' not '/path/to/my-secret.log'). If the user asked to just CHECK what files exist (without reading), then summarize. If the requested file is NOT in the listing, tell the user it doesn't exist."
|
||||
else:
|
||||
instruction = "The command has been executed. SUMMARIZE the output above to answer the user's request. Do not call additional tools."
|
||||
else:
|
||||
instruction = "The tool has completed. Use the result shown above to answer the user's request. Do not call additional tools."
|
||||
|
||||
tool_message_content = (
|
||||
f"Tool Result ({tool_name}):\n"
|
||||
f"{tool_result}\n\n"
|
||||
f"INSTRUCTION: {instruction}"
|
||||
)
|
||||
|
||||
messages.append(ChatMessage(
|
||||
role="tool",
|
||||
content=tool_message_content,
|
||||
tool_call_id=tool_call_id,
|
||||
name=tool_name
|
||||
))
|
||||
|
||||
logger.info(f" ✓ Tool result {i+1} added to history (tool_call_id={tool_call_id}, name={tool_name})")
|
||||
|
||||
logger.info(f"✅ Tools executed ({len(tool_results)} results)")
|
||||
|
||||
# Continue loop - generate response with tool results
|
||||
logger.info(f"🔄 Generating response with tool results...")
|
||||
|
||||
# Format with tool results (but DON'T include tool instruction - model should just use results)
|
||||
next_prompt = format_messages_with_tools(messages, None if use_opencode_tools else request.tools)
# Carry the tool-augmented prompt into later iterations so the loop never regenerates from the pre-tool prompt
prompt = next_prompt
|
||||
logger.info(f"📤 Prompt sent to model after tool execution:")
|
||||
logger.info(f" Total tokens: {count_tokens(next_prompt)}")
|
||||
logger.info(f" Messages in history: {len(messages)}")
|
||||
for i, msg in enumerate(messages):
|
||||
logger.info(f" [{i}] {msg.role}: {msg.content[:100]}{'...' if len(msg.content) > 100 else ''}")
|
||||
if msg.tool_calls:
|
||||
for j, tc in enumerate(msg.tool_calls):
|
||||
logger.info(f" Tool call {j}: {tc.function.get('name')} ({tc.function.get('arguments')})")
|
||||
if msg.tool_call_id:
|
||||
logger.info(f" (tool_call_id: {msg.tool_call_id}, name: {msg.name})")
|
||||
logger.debug(f"Full prompt:\n{next_prompt[:1000]}...")
|
||||
|
||||
response_text, tokens_generated, tps = await _generate_with_consensus(
|
||||
prompt=next_prompt,
|
||||
max_tokens=request.max_tokens or 1024,
|
||||
temperature=request.temperature or 0.7,
|
||||
swarm_manager=swarm_manager,
|
||||
federated_swarm=None # Tool result processing is local-only
|
||||
)
|
||||
|
||||
logger.info(f"✅ Generated with tool results ({len(response_text)} chars, {tokens_generated} tokens)")
|
||||
logger.debug(f"Response: {response_text[:200]}...")
|
||||
|
||||
# Check for more tools in the new response
|
||||
parsed_content, tool_calls_parsed = parse_tool_calls(response_text)
|
||||
|
||||
# Log assistant response to chatlog
|
||||
chat_logger.log_assistant_message(response_text, has_tool_calls=bool(tool_calls_parsed))
|
||||
|
||||
if tool_calls_parsed:
|
||||
# Log each tool call
|
||||
for i, tc in enumerate(tool_calls_parsed, 1):
|
||||
tool_name = tc.get("function", {}).get("name", "")
|
||||
args_str = tc.get("function", {}).get("arguments", "{}")
|
||||
try:
|
||||
args_dict = json.loads(args_str) if isinstance(args_str, str) else args_str
|
||||
except json.JSONDecodeError:
|
||||
args_dict = {"raw": args_str}
|
||||
chat_logger.log_tool_call(tool_name, args_dict, i)
|
||||
|
||||
# Capture thinking if not already captured
|
||||
if not thinking_captured:
|
||||
thinking_content = parsed_content or ""
|
||||
thinking_captured = True
|
||||
|
||||
if not tool_calls_parsed:
|
||||
# No more tools - final answer
|
||||
logger.info(f"✅ Final answer (after tool execution) after {iteration} iteration(s)")
|
||||
return _create_response(parsed_content, [], "stop", prompt, request, swarm_manager, thinking_content)
|
||||
|
||||
# More tools detected - continue loop
|
||||
logger.info(f"🔧 More tools found - continuing loop")
|
||||
|
||||
# Max iterations reached - force return last response
|
||||
logger.warning(f"⚠️ Max tool iterations ({max_iterations}) reached")
|
||||
logger.warning(f"⚠️ Returning last response (may include incomplete tool call)")
|
||||
return _create_response(response_text, [], "stop", prompt, request, swarm_manager, thinking_content)
|
||||
@@ -0,0 +1,271 @@
|
||||
"""Message formatting module for Local Swarm.
|
||||
|
||||
Formats chat messages into prompts and handles tool instructions.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
|
||||
from api.models import ChatMessage
|
||||
from utils.token_counter import count_tokens
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Cache for tool instructions (loaded from config file)
|
||||
_TOOL_INSTRUCTIONS_CACHE: Optional[str] = None
|
||||
|
||||
# Global flag for tool mode (default: local tool server to save tokens)
|
||||
_USE_OPENCODE_TOOLS: bool = False
|
||||
|
||||
|
||||
def set_use_opencode_tools(value: bool) -> None:
|
||||
"""Set whether to use opencode's tool definitions (default: False = local tool server).
|
||||
|
||||
Args:
|
||||
value: True to use opencode tools (~27k tokens), False to use local tool server (~125 tokens)
|
||||
"""
|
||||
global _USE_OPENCODE_TOOLS
|
||||
_USE_OPENCODE_TOOLS = value
|
||||
logger.info(f"🔧 Tool mode set to: {'opencode tools (~27k tokens)' if value else 'local tool server (~125 tokens)'}")
|
||||
|
||||
|
||||
def _load_tool_instructions() -> str:
|
||||
"""Load tool instructions from config file.
|
||||
|
||||
Loads from config/prompts/tool_instructions.txt
|
||||
Falls back to default if file not found.
|
||||
|
||||
Returns:
|
||||
Tool instructions string
|
||||
"""
|
||||
global _TOOL_INSTRUCTIONS_CACHE
|
||||
|
||||
if _TOOL_INSTRUCTIONS_CACHE is not None:
|
||||
return _TOOL_INSTRUCTIONS_CACHE
|
||||
|
||||
# Try to load from config file
|
||||
config_path = Path(__file__).parent.parent.parent / "config" / "prompts" / "tool_instructions.txt"
|
||||
|
||||
try:
|
||||
if config_path.exists():
|
||||
with open(config_path, 'r') as f:
|
||||
_TOOL_INSTRUCTIONS_CACHE = f.read().strip()
|
||||
logger.debug(f"Loaded tool instructions from {config_path}")
|
||||
else:
|
||||
# Fallback default instructions
|
||||
_TOOL_INSTRUCTIONS_CACHE = """You MUST use tools. DO NOT explain. DO NOT use markdown.
|
||||
|
||||
OUTPUT THIS EXACT FORMAT - NOTHING ELSE:
|
||||
|
||||
TOOL: bash
|
||||
ARGUMENTS: {"command": "your command here"}
|
||||
|
||||
Available tools:
|
||||
- bash: Run shell commands
|
||||
- write: Create files
|
||||
- read: Read files
|
||||
|
||||
NEVER write explanations.
|
||||
NEVER use numbered lists.
|
||||
NEVER use markdown code blocks.
|
||||
ONLY output TOOL: lines."""
|
||||
logger.warning(f"Tool instructions config not found at {config_path}, using default")
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading tool instructions: {e}")
|
||||
# Use minimal fallback
|
||||
_TOOL_INSTRUCTIONS_CACHE = 'Use TOOL: tool_name\nARGUMENTS: {"param": "value"} format.'
|
||||
|
||||
return _TOOL_INSTRUCTIONS_CACHE
|
||||
|
||||
|
||||
def _is_initial_request(messages: List[ChatMessage]) -> bool:
|
||||
"""Check if this is an initial request (no assistant or tool messages).
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
|
||||
Returns:
|
||||
True if this is the initial request
|
||||
"""
|
||||
has_assistant = any(msg.role == "assistant" for msg in messages)
|
||||
has_tool = any(msg.role == "tool" for msg in messages)
|
||||
return not has_assistant and not has_tool
|
||||
|
||||
|
||||
def _compress_large_request(messages: List[ChatMessage], max_tokens: int = 4000) -> List[ChatMessage]:
|
||||
"""Compress large initial requests by keeping only user messages.
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
max_tokens: Maximum tokens before compression
|
||||
|
||||
Returns:
|
||||
Compressed list of messages
|
||||
"""
|
||||
full_text = "\n".join([f"{msg.role}: {msg.content}" for msg in messages])
|
||||
current_tokens = count_tokens(full_text)
|
||||
|
||||
if current_tokens <= max_tokens:
|
||||
return messages
|
||||
|
||||
logger.info(f"🗜️ COMPRESSING: Initial request is {current_tokens} tokens, compressing to <{max_tokens}...")
|
||||
|
||||
# Keep only user messages
|
||||
user_messages = [msg for msg in messages if msg.role == "user"]
|
||||
|
||||
if not user_messages:
|
||||
logger.warning("No user messages found in initial request!")
|
||||
return []
|
||||
|
||||
# Get the last user message
|
||||
last_user_msg = user_messages[-1]
|
||||
user_content = last_user_msg.content
|
||||
|
||||
# Truncate if still too long
|
||||
if len(user_content) > 2000:
|
||||
user_content = user_content[:2000] + "... [truncated for token limit]"
|
||||
logger.debug(f"Truncated user message from {len(last_user_msg.content)} to 2000 chars")
|
||||
|
||||
return [ChatMessage(role="user", content=user_content)]
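
A small usage sketch (hypothetical messages) of the compression behaviour described above; only the last user message survives, truncated if it is still too long:

```python
msgs = [
    ChatMessage(role="user", content="Here is the whole project context ..."),
    ChatMessage(role="user", content="Now list the open TODOs"),
]
compressed = _compress_large_request(msgs, max_tokens=0)  # max_tokens=0 forces compression for the example
len(compressed)         # 1
compressed[0].role      # "user"
compressed[0].content   # "Now list the open TODOs"
```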
|
||||
|
||||
|
||||
def _filter_messages(messages: List[ChatMessage]) -> List[ChatMessage]:
|
||||
"""Filter messages for processing.
|
||||
|
||||
For initial requests >4000 tokens, compress aggressively.
|
||||
Otherwise, just remove system messages.
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
|
||||
Returns:
|
||||
Filtered list of messages
|
||||
"""
|
||||
if _is_initial_request(messages):
|
||||
full_text = "\n".join([f"{msg.role}: {msg.content}" for msg in messages])
|
||||
if count_tokens(full_text) > 4000:
|
||||
return _compress_large_request(messages)
|
||||
|
||||
# Normal filtering: remove system messages
|
||||
return [msg for msg in messages if msg.role != "system"]
|
||||
|
||||
|
||||
def _add_tool_instructions(messages: List[ChatMessage]) -> List[ChatMessage]:
|
||||
"""Add tool instructions to the beginning of messages.
|
||||
|
||||
Tool instructions are now ALWAYS injected by default so any client
|
||||
(Continue, hollama, etc.) can use tools without requiring client-side
|
||||
tool instruction injection.
|
||||
|
||||
TODO: Add a "plan mode" that disables tool use for planning-only conversations.
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
|
||||
Returns:
|
||||
Messages with tool instructions added
|
||||
"""
|
||||
tool_instructions = _load_tool_instructions()
|
||||
logger.debug(f"Injecting tool instructions: {len(tool_instructions)} chars")
|
||||
|
||||
# Check if instructions already present (avoid duplication)
|
||||
if messages and messages[0].role == "system" and "AVAILABLE TOOLS" in messages[0].content:
|
||||
logger.debug("Tool instructions already present, skipping injection")
|
||||
return messages
|
||||
|
||||
return [ChatMessage(role="system", content=tool_instructions)] + messages
|
||||
|
||||
|
||||
def _format_to_chatml(messages: List[ChatMessage]) -> str:
|
||||
"""Format messages to ChatML format.
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
|
||||
Returns:
|
||||
ChatML formatted string
|
||||
"""
|
||||
formatted = []
|
||||
|
||||
for msg in messages:
|
||||
role = msg.role
|
||||
content = msg.content
|
||||
|
||||
if role == "system":
|
||||
formatted.append(f"<|im_start|>system\n{content}<|im_end|>")
|
||||
elif role == "user":
|
||||
formatted.append(f"<|im_start|>user\n{content}<|im_end|>")
|
||||
elif role == "assistant":
|
||||
formatted.append(f"<|im_start|>assistant\n{content}<|im_end|>")
|
||||
elif role == "tool":
|
||||
tool_name = getattr(msg, 'name', 'tool')
|
||||
formatted.append(f"<|im_start|>tool\n{tool_name}: {content}<|im_end|>")
|
||||
|
||||
formatted.append("<|im_start|>assistant\n")
|
||||
return "\n".join(formatted)
|
||||
|
||||
|
||||
def _log_prompt_preview(messages: List[ChatMessage]) -> None:
|
||||
"""Log a preview of the prompt for debugging.
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
"""
|
||||
preview = []
|
||||
for msg in messages:
|
||||
if msg.role == "system":
|
||||
preview.append(f"[SYSTEM] {msg.content[:200]}...")
|
||||
elif msg.role == "user":
|
||||
preview.append(f"[USER] {msg.content}")
|
||||
logger.debug(f"Prompt preview: {' | '.join(preview)}")
|
||||
|
||||
|
||||
def format_messages_with_tools(
|
||||
messages: List[ChatMessage],
|
||||
tools: Optional[list] = None
|
||||
) -> str:
|
||||
"""Format chat messages into a single prompt using ChatML format.
|
||||
|
||||
Note: Tools are handled server-side. The model should respond normally.
|
||||
IMPORTANT: If _USE_OPENCODE_TOOLS is True, use opencode's tool definitions (~27k tokens).
|
||||
If False, use local tool server (~125 tokens) to save tokens.
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
tools: Optional list of tools (currently ignored, server-side handling)
|
||||
|
||||
Returns:
|
||||
Formatted prompt string in ChatML format
|
||||
"""
|
||||
# Filter messages
|
||||
filtered_messages = _filter_messages(messages)
|
||||
|
||||
# Add tool instructions if needed
|
||||
filtered_messages = _add_tool_instructions(filtered_messages)
|
||||
|
||||
# Log preview
|
||||
_log_prompt_preview(filtered_messages)
|
||||
|
||||
# Format to ChatML
|
||||
result = _format_to_chatml(filtered_messages)
|
||||
|
||||
# Log final token count
|
||||
final_tokens = count_tokens(result)
|
||||
original_tokens = count_tokens("\n".join([f"{msg.role}: {msg.content}" for msg in messages]))
|
||||
logger.info(f"📊 Final prompt size: {final_tokens} tokens (reduced from {original_tokens})")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def format_messages(messages: List[ChatMessage]) -> str:
|
||||
"""Format chat messages into a single prompt using ChatML format.
|
||||
|
||||
Args:
|
||||
messages: List of chat messages
|
||||
|
||||
Returns:
|
||||
Formatted prompt string
|
||||
"""
|
||||
return format_messages_with_tools(messages, None)
|
||||
+18
-11
@@ -4,7 +4,7 @@ Pydantic models matching OpenAI's API specification.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Literal, Dict, Any, Union
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import BaseModel, Field, ConfigDict
|
||||
|
||||
|
||||
class FunctionDefinition(BaseModel):
|
||||
@@ -30,13 +30,15 @@ class ToolCall(BaseModel):
|
||||
class ChatMessage(BaseModel):
|
||||
"""A chat message."""
|
||||
role: Literal["system", "user", "assistant", "tool"] = Field(..., description="Role of the message sender")
|
||||
content: Optional[str] = Field(default=None, description="Message content")
|
||||
tool_calls: Optional[List[ToolCall]] = Field(default_factory=list, description="Tool calls from assistant")
|
||||
#tool_call_id: Optional[str] = Field(default=None, description="ID of tool call this message is responding to")
|
||||
#name: Optional[str] = Field(default=None, description="Name of the tool/function")
|
||||
|
||||
class Config:
|
||||
exclude_none = True
|
||||
content: str = Field(default="", description="Message content")
|
||||
tool_calls: Optional[List[ToolCall]] = Field(default=None, description="Tool calls from assistant")
|
||||
tool_call_id: Optional[str] = Field(default=None, description="ID of tool call this message is responding to")
|
||||
name: Optional[str] = Field(default=None, description="Name of the tool/function")
|
||||
|
||||
model_config = ConfigDict(
|
||||
# Use Pydantic's exclude_none to omit tool_calls when None
|
||||
exclude_none=True
|
||||
)
|
||||
|
||||
|
||||
class ChatCompletionRequest(BaseModel):
|
||||
@@ -50,9 +52,9 @@ class ChatCompletionRequest(BaseModel):
|
||||
stop: Optional[List[str]] = Field(default=None, description="Stop sequences")
|
||||
tools: Optional[List[Tool]] = Field(default=None, description="List of tools the model may call")
|
||||
tool_choice: Optional[Union[str, Dict[str, Any]]] = Field(default="auto", description="How to choose tools")
|
||||
|
||||
class Config:
|
||||
json_schema_extra = {
|
||||
|
||||
model_config = ConfigDict(
|
||||
json_schema_extra={
|
||||
"example": {
|
||||
"model": "local-swarm",
|
||||
"messages": [
|
||||
@@ -62,12 +64,14 @@ class ChatCompletionRequest(BaseModel):
|
||||
"temperature": 0.7
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class ChatCompletionChoice(BaseModel):
|
||||
"""A choice in the chat completion response."""
|
||||
index: int = Field(..., description="Choice index")
|
||||
message: ChatMessage = Field(..., description="Generated message")
|
||||
logprobs: Optional[Any] = Field(default=None, description="Log probabilities")
|
||||
finish_reason: Optional[str] = Field(default="stop", description="Reason for finishing (stop, length, tool_calls, etc.)")
|
||||
|
||||
|
||||
@@ -76,6 +80,7 @@ class UsageInfo(BaseModel):
|
||||
prompt_tokens: int = Field(default=0, description="Tokens in prompt")
|
||||
completion_tokens: int = Field(default=0, description="Tokens in completion")
|
||||
total_tokens: int = Field(default=0, description="Total tokens")
|
||||
tokens_per_second: Optional[float] = Field(default=None, description="Generation speed in tokens per second")
|
||||
|
||||
|
||||
class ChatCompletionResponse(BaseModel):
|
||||
@@ -86,6 +91,8 @@ class ChatCompletionResponse(BaseModel):
|
||||
model: str = Field(..., description="Model used")
|
||||
choices: List[ChatCompletionChoice] = Field(..., description="Generated choices")
|
||||
usage: UsageInfo = Field(..., description="Token usage")
|
||||
stats: Dict[str, Any] = Field(default_factory=dict, description="Additional stats")
|
||||
system_fingerprint: Optional[str] = Field(default=None, description="System fingerprint")
|
||||
|
||||
|
||||
class ChatCompletionStreamChoice(BaseModel):
|
||||
|
||||
+324
-840
File diff suppressed because it is too large
+20
-11
@@ -18,21 +18,23 @@ from swarm.status_monitor import StatusMonitor
|
||||
|
||||
class APIServer:
|
||||
"""OpenAI-compatible API server."""
|
||||
|
||||
def __init__(self, swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True):
|
||||
|
||||
def __init__(self, swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True, use_opencode_tools: bool = False):
|
||||
"""
|
||||
Initialize API server.
|
||||
|
||||
|
||||
Args:
|
||||
swarm_manager: Swarm manager instance
|
||||
host: Host to bind to
|
||||
port: Port to listen on
|
||||
show_live_status: Whether to show live worker status updates
|
||||
use_opencode_tools: Whether to use opencode's tool definitions (~27k tokens) or local tool server (~125 tokens)
|
||||
"""
|
||||
self.swarm_manager = swarm_manager
|
||||
self.host = host
|
||||
self.port = port
|
||||
self.show_live_status = show_live_status
|
||||
self.use_opencode_tools = use_opencode_tools
|
||||
self.status_monitor: Optional[StatusMonitor] = None
|
||||
self.app = self._create_app()
|
||||
|
||||
@@ -42,8 +44,12 @@ class APIServer:
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""Lifespan context manager for startup/shutdown."""
|
||||
# Startup: Set swarm manager in routes
|
||||
# Startup: Set swarm manager in routes and app state
|
||||
set_swarm_manager(self.swarm_manager)
|
||||
app.state.swarm_manager = self.swarm_manager # For federation endpoint
|
||||
# Set tool mode in routes
|
||||
from api.routes import set_use_opencode_tools
|
||||
set_use_opencode_tools(self.use_opencode_tools)
|
||||
print(f"\n🌐 API server starting on http://{self.host}:{self.port}")
|
||||
print(f" Endpoints:")
|
||||
print(f" - POST /v1/chat/completions")
|
||||
@@ -90,32 +96,35 @@ class APIServer:
|
||||
self.app,
|
||||
host=self.host,
|
||||
port=self.port,
|
||||
log_level="info"
|
||||
log_level="warning",
|
||||
access_log=False
|
||||
)
|
||||
server = uvicorn.Server(config)
|
||||
await server.serve()
|
||||
|
||||
|
||||
def run_sync(self):
|
||||
"""Run server synchronously (blocking)."""
|
||||
uvicorn.run(
|
||||
self.app,
|
||||
host=self.host,
|
||||
port=self.port,
|
||||
log_level="info"
|
||||
log_level="warning",
|
||||
access_log=False
|
||||
)
|
||||
|
||||
|
||||
def create_server(swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True) -> APIServer:
|
||||
def create_server(swarm_manager: SwarmManager, host: str = "127.0.0.1", port: int = 17615, show_live_status: bool = True, use_opencode_tools: bool = False) -> APIServer:
|
||||
"""
|
||||
Create API server instance.
|
||||
|
||||
|
||||
Args:
|
||||
swarm_manager: Swarm manager instance
|
||||
host: Host to bind to
|
||||
port: Port to listen on
|
||||
show_live_status: Whether to show live worker status updates
|
||||
|
||||
use_opencode_tools: Whether to use opencode's tool definitions (~27k tokens) or local tool server (~125 tokens)
|
||||
|
||||
Returns:
|
||||
APIServer instance
|
||||
"""
|
||||
return APIServer(swarm_manager, host, port, show_live_status)
|
||||
return APIServer(swarm_manager, host, port, show_live_status, use_opencode_tools)
|
||||
|
||||
@@ -0,0 +1,250 @@
|
||||
"""Tool parsing module for Local Swarm.
|
||||
|
||||
Parses tool calls from model output in various formats.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Tuple, Optional, List, Dict, Any
|
||||
|
||||
|
||||
def ensure_tool_arguments(tool_name: str, args_dict: dict) -> dict:
|
||||
"""Ensure tool arguments have all required fields.
|
||||
|
||||
For bash tool: inject 'description' field if missing.
|
||||
|
||||
Args:
|
||||
tool_name: Name of the tool
|
||||
args_dict: Tool arguments dictionary
|
||||
|
||||
Returns:
|
||||
Updated arguments dictionary
|
||||
"""
|
||||
if tool_name == 'bash' and 'description' not in args_dict:
|
||||
# Generate description from command
|
||||
command = args_dict.get('command', '')
|
||||
desc = command.split()[0] if command else 'Execute command'
|
||||
args_dict['description'] = desc
|
||||
return args_dict
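
For instance (hypothetical arguments), the bash tool gets a description derived from the first word of the command, while other tools pass through unchanged:

```python
ensure_tool_arguments("bash", {"command": "git status"})
# {'command': 'git status', 'description': 'git'}
ensure_tool_arguments("read", {"filePath": "README.md"})
# {'filePath': 'README.md'} - unchanged; only bash gets a description injected
```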
|
||||
|
||||
|
||||
def _parse_standard_format(text: str) -> Tuple[Optional[str], Optional[List[Dict[str, Any]]]]:
|
||||
"""Parse standard TOOL: format.
|
||||
|
||||
Format: TOOL: name\nARGUMENTS: {"key": "value"}
|
||||
|
||||
Args:
|
||||
text: Model output text
|
||||
|
||||
Returns:
|
||||
Tuple of (content_without_tools, tool_calls) or (None, None) if not found
|
||||
"""
|
||||
tool_pattern = r'TOOL:\s*(\w+)\s*\nARGUMENTS:\s*(\{[^}]*\})'
|
||||
tool_matches = list(re.finditer(tool_pattern, text, re.IGNORECASE))
|
||||
|
||||
if not tool_matches:
|
||||
return None, None
|
||||
|
||||
tool_calls = []
|
||||
for i, tool_match in enumerate(tool_matches):
|
||||
tool_name = tool_match.group(1)
|
||||
args_str = tool_match.group(2)
|
||||
try:
|
||||
args_dict = json.loads(args_str)
|
||||
args_dict = ensure_tool_arguments(tool_name, args_dict)
|
||||
tool_calls.append({
|
||||
"id": f"call_{i+1}",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": tool_name,
|
||||
"arguments": json.dumps(args_dict)
|
||||
}
|
||||
})
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
if tool_calls:
|
||||
first_start = tool_matches[0].start()
|
||||
content = text[:first_start].strip()
|
||||
return content, tool_calls
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def _parse_markdown_format(text: str) -> Tuple[Optional[str], Optional[List[Dict[str, Any]]]]:
|
||||
"""Parse markdown code block format.
|
||||
|
||||
Format: ```bash command```
|
||||
|
||||
Args:
|
||||
text: Model output text
|
||||
|
||||
Returns:
|
||||
Tuple of (content_without_tools, tool_calls) or (None, None) if not found
|
||||
"""
|
||||
markdown_pattern = r'```(?:bash|shell|sh)?\s*\n(.*?)\n```'
|
||||
markdown_matches = list(re.finditer(markdown_pattern, text, re.DOTALL))
|
||||
|
||||
if not markdown_matches:
|
||||
return None, None
|
||||
|
||||
tool_calls = []
|
||||
for i, match in enumerate(markdown_matches):
|
||||
code_content = match.group(1).strip()
|
||||
if code_content:
|
||||
args_dict = {"command": code_content}
|
||||
args_dict = ensure_tool_arguments("bash", args_dict)
|
||||
tool_calls.append({
|
||||
"id": f"call_{i+1}",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "bash",
|
||||
"arguments": json.dumps(args_dict)
|
||||
}
|
||||
})
|
||||
|
||||
if tool_calls:
|
||||
first_start = markdown_matches[0].start()
|
||||
content = text[:first_start].strip()
|
||||
return content, tool_calls
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def _parse_command_lines(text: str) -> Tuple[Optional[str], Optional[List[Dict[str, Any]]]]:
|
||||
"""Parse command lines in text.
|
||||
|
||||
Matches common bash commands with their arguments.
|
||||
|
||||
Args:
|
||||
text: Model output text
|
||||
|
||||
Returns:
|
||||
Tuple of (content_without_tools, tool_calls) or (None, None) if not found
|
||||
"""
|
||||
command_lines = []
|
||||
command_pattern = r'^(npm|npx|mkdir|cd|ls|cat|echo|git|python|pip|node|yarn|create-react-app)\s+'
|
||||
|
||||
for line in text.split('\n'):
|
||||
line = line.strip()
|
||||
if re.match(command_pattern, line):
|
||||
command_lines.append(line)
|
||||
|
||||
if command_lines:
|
||||
combined_command = ' && '.join(command_lines)
|
||||
args_dict = {"command": combined_command}
|
||||
args_dict = ensure_tool_arguments("bash", args_dict)
|
||||
return "", [{
|
||||
"id": "call_1",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "bash",
|
||||
"arguments": json.dumps(args_dict)
|
||||
}
|
||||
}]
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def _parse_standalone_commands(text: str) -> Tuple[Optional[str], Optional[List[Dict[str, Any]]]]:
|
||||
"""Parse standalone bash commands.
|
||||
|
||||
Args:
|
||||
text: Model output text
|
||||
|
||||
Returns:
|
||||
Tuple of (content_without_tools, tool_calls) or (None, None) if not found
|
||||
"""
|
||||
standalone_pattern = r'(?:^|\n)(npm\s+\w+|npx\s+\w+|mkdir\s+\w+|cd\s+\w+|git\s+\w+)(?:\s|$)'
|
||||
standalone_matches = list(re.finditer(standalone_pattern, text, re.MULTILINE))
|
||||
|
||||
if standalone_matches:
|
||||
commands = [match.group(1).strip() for match in standalone_matches]
|
||||
if commands:
|
||||
combined_command = ' && '.join(commands)
|
||||
args_dict = {"command": combined_command}
|
||||
args_dict = ensure_tool_arguments("bash", args_dict)
|
||||
return "", [{
|
||||
"id": "call_1",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "bash",
|
||||
"arguments": json.dumps(args_dict)
|
||||
}
|
||||
}]
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def _parse_urls(text: str) -> Tuple[Optional[str], Optional[List[Dict[str, Any]]]]:
|
||||
"""Parse URLs for webfetch tool.
|
||||
|
||||
Args:
|
||||
text: Model output text
|
||||
|
||||
Returns:
|
||||
Tuple of (content_without_tools, tool_calls) or (None, None) if not found
|
||||
"""
|
||||
url_pattern = r'https?://[^\s<>"\')\]]+[a-zA-Z0-9]'
|
||||
url_matches = list(re.finditer(url_pattern, text))
|
||||
|
||||
if url_matches:
|
||||
urls = [match.group(0) for match in url_matches]
|
||||
if urls:
|
||||
tool_calls = []
|
||||
for i, url in enumerate(urls):
|
||||
tool_calls.append({
|
||||
"id": f"call_{i+1}",
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "webfetch",
|
||||
"arguments": json.dumps({"url": url, "format": "markdown"})
|
||||
}
|
||||
})
|
||||
return "", tool_calls
|
||||
|
||||
return None, None
|
||||
|
||||
|
||||
def parse_tool_calls(text: str) -> Tuple[str, Optional[List[Dict[str, Any]]]]:
|
||||
"""Parse tool calls from model output using multiple formats.
|
||||
|
||||
Supports multiple formats for compatibility with different model sizes:
|
||||
1. Standard: TOOL: name\nARGUMENTS: {"key": "value"}
|
||||
2. Markdown: ```bash command```
|
||||
3. Command lines: npm install, git clone, etc.
|
||||
4. Standalone commands
|
||||
5. URLs: for webfetch tool
|
||||
|
||||
Args:
|
||||
text: Model output text
|
||||
|
||||
Returns:
|
||||
Tuple of (content_without_tools, tool_calls or None)
|
||||
"""
|
||||
# Priority 1: Standard format
|
||||
result = _parse_standard_format(text)
|
||||
if result[1] is not None:
|
||||
return result[0] or "", result[1]
|
||||
|
||||
# Priority 2: Markdown code blocks
|
||||
result = _parse_markdown_format(text)
|
||||
if result[1] is not None:
|
||||
return result[0] or "", result[1]
|
||||
|
||||
# Priority 3: Command lines
|
||||
result = _parse_command_lines(text)
|
||||
if result[1] is not None:
|
||||
return result[0] or "", result[1]
|
||||
|
||||
# Priority 4: Standalone commands
|
||||
result = _parse_standalone_commands(text)
|
||||
if result[1] is not None:
|
||||
return result[0] or "", result[1]
|
||||
|
||||
# Priority 5: URLs
|
||||
result = _parse_urls(text)
|
||||
if result[1] is not None:
|
||||
return result[0] or "", result[1]
|
||||
|
||||
return text, None
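
To make the priority order concrete, a hedged example of the standard `TOOL:` format; the string below is illustrative, not captured model output:

```python
import json

raw = 'I will list the files first.\nTOOL: bash\nARGUMENTS: {"command": "ls -la"}'
content, calls = parse_tool_calls(raw)
content                                        # "I will list the files first."
calls[0]["function"]["name"]                   # "bash"
json.loads(calls[0]["function"]["arguments"])  # {"command": "ls -la", "description": "ls"}
# A bare fenced bash block or a plain "npm install ..." line falls through to the
# markdown / command-line parsers instead, producing an equivalent bash tool call.
```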
|
||||
@@ -4,7 +4,7 @@ Creates the appropriate backend based on hardware and platform.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from hardware.detector import HardwareProfile, detect_hardware
|
||||
from hardware.detector import HardwareProfile, detect_hardware, calculate_gpu_layers
|
||||
from backends.base import LLMBackend
|
||||
from backends.llamacpp import LlamaCppBackend
|
||||
from backends.mlx import MLXBackend
|
||||
@@ -31,15 +31,17 @@ def create_backend(hardware: Optional[HardwareProfile] = None) -> LLMBackend:
|
||||
# Otherwise use llama.cpp (supports CUDA, ROCm, SYCL, CPU)
|
||||
print("Using llama.cpp backend")
|
||||
|
||||
# Determine GPU layers
|
||||
# Auto-configure GPU layers based on hardware
|
||||
n_gpu_layers = calculate_gpu_layers(hardware.gpu)
|
||||
|
||||
if hardware.gpu and not hardware.is_apple_silicon:
|
||||
# Has external GPU, offload all layers
|
||||
n_gpu_layers = -1
|
||||
print(f" GPU detected: {hardware.gpu.name}")
|
||||
print(f" Offloading all layers to GPU")
|
||||
if hardware.gpu.is_nvidia:
|
||||
print(f" Compute capability: {hardware.gpu.compute_capability or 'unknown'}")
|
||||
if hardware.gpu.device_count > 1:
|
||||
print(f" GPU count: {hardware.gpu.device_count}")
|
||||
print(f" Offloading {n_gpu_layers} layers to GPU")
|
||||
else:
|
||||
# CPU only
|
||||
n_gpu_layers = 0
|
||||
print(f" No GPU detected, using CPU")
|
||||
|
||||
return LlamaCppBackend(n_gpu_layers=n_gpu_layers)
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
"""Chatlog for debugging tool execution.
|
||||
|
||||
Writes a human-readable markdown log of tool calls and results.
|
||||
Enabled by setting LOCAL_SWARM_CHATLOG=1 environment variable.
|
||||
Log file defaults to 'chatlog.md' in the current working directory.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class ChatLogger:
|
||||
"""Logs chat interactions and tool execution in opencode-style format."""
|
||||
|
||||
def __init__(self, log_path: Optional[str] = None):
|
||||
self.log_path = log_path or os.getenv('LOCAL_SWARM_CHATLOG_PATH', 'chatlog.md')
|
||||
self.enabled = os.getenv('LOCAL_SWARM_CHATLOG', '0') == '1'
|
||||
if self.enabled:
|
||||
self._initialize_log()
|
||||
|
||||
def _initialize_log(self):
|
||||
"""Create log file with header if it doesn't exist."""
|
||||
dir_path = os.path.dirname(self.log_path) or '.'
|
||||
os.makedirs(dir_path, exist_ok=True)
|
||||
with open(self.log_path, 'a') as f:
|
||||
f.write(f"\n\n# Local Swarm Session - {datetime.now().isoformat()}\n\n")
|
||||
|
||||
def _timestamp(self) -> str:
|
||||
"""Get current timestamp."""
|
||||
return datetime.now().strftime("%H:%M:%S")
|
||||
|
||||
def log_user_message(self, content: str):
|
||||
"""Log a user message."""
|
||||
if not self.enabled:
|
||||
return
|
||||
with open(self.log_path, 'a') as f:
|
||||
f.write(f"\n## [{self._timestamp()}] User\n\n")
|
||||
f.write(f"{content}\n\n")
|
||||
|
||||
def log_assistant_message(self, content: str, has_tool_calls: bool = False):
|
||||
"""Log an assistant response."""
|
||||
if not self.enabled:
|
||||
return
|
||||
with open(self.log_path, 'a') as f:
|
||||
f.write(f"\n## [{self._timestamp()}] Assistant\n\n")
|
||||
if has_tool_calls:
|
||||
# Use thinking block for messages that contain tool calls
|
||||
f.write(f"```thinking\n{content}\n```\n")
|
||||
else:
|
||||
f.write(f"{content}\n\n")
|
||||
|
||||
def log_tool_call(self, tool_name: str, arguments: dict, call_index: int = 1):
|
||||
"""Log a tool execution request."""
|
||||
if not self.enabled:
|
||||
return
|
||||
with open(self.log_path, 'a') as f:
|
||||
f.write(f"\n## [{self._timestamp()}] Tool Call #{call_index}\n\n")
|
||||
f.write(f"**Tool:** `{tool_name}`\n\n")
|
||||
f.write(f"**Arguments:**\n")
|
||||
try:
|
||||
args_json = json.dumps(arguments, indent=2)
|
||||
except Exception:
|
||||
args_json = str(arguments)
|
||||
f.write(f"```json\n{args_json}\n```\n")
|
||||
|
||||
def log_tool_result(self, tool_name: str, result: str, call_index: int = 1, success: bool = True):
|
||||
"""Log a tool execution result."""
|
||||
if not self.enabled:
|
||||
return
|
||||
with open(self.log_path, 'a') as f:
|
||||
f.write(f"\n## [{self._timestamp()}] Tool Result #{call_index}\n\n")
|
||||
status = "✓ Success" if success else "✗ Failed"
|
||||
f.write(f"**Tool:** `{tool_name}` - {status}\n\n")
|
||||
f.write(f"**Output:**\n")
|
||||
f.write(f"```\n{result}\n```\n")
|
||||
|
||||
def log_system(self, message: str):
|
||||
"""Log a system message."""
|
||||
if not self.enabled:
|
||||
return
|
||||
with open(self.log_path, 'a') as f:
|
||||
f.write(f"\n## [{self._timestamp()}] System\n\n")
|
||||
f.write(f"> {message}\n\n")
|
||||
|
||||
|
||||
# Global logger instance (lazy initialization handled per request)
|
||||
_global_logger: Optional[ChatLogger] = None
|
||||
|
||||
|
||||
def get_chat_logger() -> ChatLogger:
|
||||
"""Get the global chat logger instance (creates one if needed)."""
|
||||
global _global_logger
|
||||
if _global_logger is None:
|
||||
_global_logger = ChatLogger()
|
||||
return _global_logger
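
A minimal usage sketch, assuming the module is importable as `utils.chat_logger` (the exact module path is an assumption); the environment variables must be set before the first `get_chat_logger()` call, because the instance reads them once and is then cached:

```python
import os

os.environ["LOCAL_SWARM_CHATLOG"] = "1"                     # enable logging
os.environ["LOCAL_SWARM_CHATLOG_PATH"] = "logs/chatlog.md"  # optional, defaults to ./chatlog.md

from utils.chat_logger import get_chat_logger  # assumed module path

chat_log = get_chat_logger()
chat_log.log_user_message("read my-secret.log")
chat_log.log_tool_call("read", {"filePath": "my-secret.log"}, call_index=1)
chat_log.log_tool_result("read", "hello world", call_index=1, success=True)
```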
|
||||
@@ -0,0 +1,286 @@
|
||||
"""Main application runner for Local Swarm.
|
||||
|
||||
Handles the primary application modes: download-only, test, and full server mode.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from typing import Optional
|
||||
|
||||
from models.selector import select_optimal_model, ModelConfig
|
||||
from models.downloader import download_model_for_config
|
||||
from swarm import SwarmManager
|
||||
from api import create_server
|
||||
from api.routes import set_federated_swarm
|
||||
from interactive import (
|
||||
interactive_model_selection,
|
||||
show_startup_summary,
|
||||
show_runtime_menu,
|
||||
)
|
||||
from network import create_discovery_service, FederatedSwarm
|
||||
from tools.executor import ToolExecutor, set_tool_executor
|
||||
from utils.network import get_local_ip
|
||||
|
||||
|
||||
class MainRunner:
|
||||
"""Runs the main application logic."""
|
||||
|
||||
def __init__(self, hardware, args):
|
||||
"""Initialize the main runner.
|
||||
|
||||
Args:
|
||||
hardware: Hardware profile
|
||||
args: Parsed command line arguments
|
||||
"""
|
||||
self.hardware = hardware
|
||||
self.args = args
|
||||
self.config: Optional[ModelConfig] = None
|
||||
self.swarm: Optional[SwarmManager] = None
|
||||
self.discovery = None
|
||||
self.federated_swarm = None
|
||||
self.mcp_server = None
|
||||
|
||||
async def run(self) -> int:
|
||||
"""Run the main application.
|
||||
|
||||
Returns:
|
||||
Exit code (0 for success, 1 for error)
|
||||
"""
|
||||
# Get configuration
|
||||
self.config = self._get_configuration()
|
||||
if not self.config:
|
||||
return 1
|
||||
|
||||
# Handle download-only mode
|
||||
if self.args.download_only:
|
||||
return await self._run_download_mode()
|
||||
|
||||
# Handle test mode
|
||||
if self.args.test:
|
||||
return await self._run_test_mode()
|
||||
|
||||
# Run full server mode
|
||||
return await self._run_server_mode()
|
||||
|
||||
def _get_configuration(self) -> Optional[ModelConfig]:
|
||||
"""Get the model configuration."""
|
||||
if self.args.model or self.args.instances or self.args.auto:
|
||||
return self._get_auto_config()
|
||||
else:
|
||||
return interactive_model_selection(self.hardware)
|
||||
|
||||
def _get_auto_config(self) -> Optional[ModelConfig]:
|
||||
"""Get auto-detected configuration."""
|
||||
print("\n📊 Calculating optimal configuration...")
|
||||
try:
|
||||
config = select_optimal_model(
|
||||
self.hardware,
|
||||
preferred_model=self.args.model,
|
||||
force_instances=self.args.instances,
|
||||
use_mlx=None # Auto-detect based on hardware
|
||||
)
|
||||
|
||||
if not config:
|
||||
print("\n❌ No suitable model found for your hardware")
|
||||
print(" Minimum requirement: 2 GB available memory")
|
||||
return None
|
||||
|
||||
print(f"\n✓ Selected: {config.display_name}")
|
||||
print(f" Instances: {config.instances}")
|
||||
print(f" Memory: {config.total_memory_gb:.1f} GB")
|
||||
return config
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error selecting model: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
async def _run_download_mode(self) -> int:
|
||||
"""Run download-only mode."""
|
||||
print("\n" + "=" * 70)
|
||||
print("⬇️ Download Mode: Downloading model only")
|
||||
print("=" * 70)
|
||||
|
||||
try:
|
||||
model_path = download_model_for_config(self.config)
|
||||
print(f"✓ Model downloaded to: {model_path}")
|
||||
print("\n" + "=" * 70)
|
||||
print("✅ Download complete")
|
||||
print("=" * 70)
|
||||
return 0
|
||||
except Exception as e:
|
||||
print(f"\n❌ Download failed: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
async def _run_test_mode(self) -> int:
|
||||
"""Run test mode with sample prompt."""
|
||||
from cli.test_runner import run_test
|
||||
return await run_test(self.hardware, self.config)
|
||||
|
||||
async def _run_server_mode(self) -> int:
|
||||
"""Run full server mode."""
|
||||
show_startup_summary(self.hardware, self.config)
|
||||
|
||||
# Setup swarm
|
||||
if not await self._setup_swarm():
|
||||
return 1
|
||||
|
||||
# Initialize tool executor
|
||||
self._setup_tool_executor()
|
||||
|
||||
# Show updated summary with runtime info
|
||||
show_startup_summary(self.hardware, self.config, self.swarm)
|
||||
|
||||
# Initialize federation if enabled
|
||||
if self.args.federation:
|
||||
await self._setup_federation()
|
||||
|
||||
# Start MCP server if enabled
|
||||
if self.args.mcp:
|
||||
await self._setup_mcp()
|
||||
|
||||
# Run server
|
||||
return await self._run_server()
|
||||
|
||||
async def _setup_swarm(self) -> bool:
|
||||
"""Setup the swarm.
|
||||
|
||||
Returns:
|
||||
True if successful
|
||||
"""
|
||||
print("\n⬇️ Downloading model...")
|
||||
try:
|
||||
model_path = download_model_for_config(self.config)
|
||||
print(f"✓ Model ready at: {model_path}")
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error downloading model: {e}", file=sys.stderr)
|
||||
return False
|
||||
|
||||
print("\n🚀 Initializing swarm...")
|
||||
try:
|
||||
self.swarm = SwarmManager(
|
||||
model_config=self.config,
|
||||
hardware=self.hardware,
|
||||
consensus_strategy="similarity"
|
||||
)
|
||||
|
||||
success = await self.swarm.initialize(str(model_path))
|
||||
if not success:
|
||||
print("❌ Failed to initialize swarm")
|
||||
return False
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error initializing swarm: {e}", file=sys.stderr)
|
||||
return False
|
||||
|
||||
def _setup_tool_executor(self) -> None:
|
||||
"""Setup the tool executor."""
|
||||
if self.args.tool_host is not None:
|
||||
if self.args.tool_host == "":
|
||||
tool_host_url = f"http://{get_local_ip()}:17616"
|
||||
print(f"\n🔧 Using remote tool host: {tool_host_url} (auto-detected)")
|
||||
else:
|
||||
tool_host_url = self.args.tool_host
|
||||
print(f"\n🔧 Using remote tool host: {tool_host_url}")
|
||||
executor = ToolExecutor(tool_host_url=tool_host_url)
|
||||
else:
|
||||
executor = ToolExecutor(tool_host_url=None)
|
||||
print("\n🔧 Tool Server: Local")
|
||||
|
||||
set_tool_executor(executor)
|
||||
|
||||
async def _setup_federation(self) -> None:
|
||||
"""Setup federation."""
|
||||
print("\n🌐 Initializing federation...")
|
||||
try:
|
||||
advertise_ip = self.args.host if self.args.host else None
|
||||
self.discovery = await create_discovery_service(
|
||||
self.args.port,
|
||||
advertise_ip=advertise_ip
|
||||
)
|
||||
|
||||
swarm_info = {
|
||||
"version": "0.1.0",
|
||||
"instances": self.config.instances,
|
||||
"model_id": self.config.model_id,
|
||||
"hardware_summary": f"{self.hardware.cpu_cores} CPU, {self.hardware.ram_gb:.1f}GB RAM"
|
||||
}
|
||||
|
||||
await self.discovery.start_advertising(swarm_info)
|
||||
await self.discovery.start_listening()
|
||||
|
||||
# Add manual peers
|
||||
if self.args.peers:
|
||||
await self._add_manual_peers()
|
||||
|
||||
self.federated_swarm = FederatedSwarm(self.swarm, self.discovery)
|
||||
set_federated_swarm(self.federated_swarm)
|
||||
|
||||
# Start health check loop
|
||||
asyncio.create_task(
|
||||
self.discovery.start_health_check_loop(interval_seconds=10)
|
||||
)
|
||||
|
||||
print(f" ✓ Federation enabled")
|
||||
print(f" ✓ Discovery active on port {self.discovery.discovery_port}")
|
||||
print(f" ✓ Peer health checks every 10s")
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Failed to initialize federation: {e}")
|
||||
print(" Continuing without federation...")
|
||||
|
||||
async def _add_manual_peers(self) -> None:
|
||||
"""Add manual peers from command line."""
|
||||
print(f" 📍 Adding {len(self.args.peers)} manual peer(s)...")
|
||||
from network.discovery import PeerInfo
|
||||
from datetime import datetime
|
||||
|
||||
for peer_str in self.args.peers:
|
||||
try:
|
||||
host, port = peer_str.rsplit(':', 1)
|
||||
port = int(port)
|
||||
peer = PeerInfo(
|
||||
host=host,
|
||||
port=port,
|
||||
name=f"manual_{host}_{port}",
|
||||
version="0.1.0",
|
||||
instances=0,
|
||||
model_id="unknown",
|
||||
hardware_summary="manual",
|
||||
last_seen=datetime.now()
|
||||
)
|
||||
self.discovery.peers[peer.name] = peer
|
||||
print(f" ✓ Added peer: {host}:{port}")
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Failed to add peer {peer_str}: {e}")
|
||||
|
||||
async def _setup_mcp(self) -> None:
|
||||
"""Setup MCP server."""
|
||||
print("\n🤖 Starting MCP server...")
|
||||
from mcp_server import create_mcp_server
|
||||
self.mcp_server = await create_mcp_server(self.swarm)
|
||||
print(" MCP server active (stdio)")
|
||||
|
||||
async def _run_server(self) -> int:
|
||||
"""Run the API server."""
|
||||
from cli.server_runner import ServerRunner
|
||||
|
||||
runner = ServerRunner(
|
||||
self.swarm,
|
||||
self.discovery,
|
||||
self.federated_swarm,
|
||||
self.args
|
||||
)
|
||||
|
||||
try:
|
||||
return await runner.run()
|
||||
finally:
|
||||
await self._shutdown()
|
||||
|
||||
async def _shutdown(self) -> None:
|
||||
"""Shutdown all services."""
|
||||
if self.federated_swarm:
|
||||
await self.federated_swarm.close()
|
||||
if self.discovery:
|
||||
await self.discovery.stop()
|
||||
if self.swarm:
|
||||
await self.swarm.shutdown()
|
||||
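
# Illustrative sketch (not part of the diff above): how a --peer value is expected to be
# parsed by _add_manual_peers(). rsplit(':', 1) splits on the last colon, so only plain
# "host:port" strings are supported; the helper name here is hypothetical.
def parse_peer(peer_str: str):
    host, port = peer_str.rsplit(':', 1)
    return host, int(port)

assert parse_peer("192.168.1.10:17615") == ("192.168.1.10", 17615)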
@@ -0,0 +1,151 @@
"""CLI argument parsing for Local Swarm."""

import argparse
from typing import Optional


def create_parser() -> argparse.ArgumentParser:
    """Create and configure the argument parser."""
    parser = argparse.ArgumentParser(
        description="Local Swarm - AI-powered coding LLM swarm",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python main.py                      # Interactive setup and start
  python main.py --auto               # Auto-detect and start without menu
  python main.py --detect             # Show hardware detection only
  python main.py --model qwen:3b:q4   # Use specific model (skip menu)
  python main.py --port 17615         # Use custom port (default: 17615)
  python main.py --host 192.168.1.5   # Bind to specific IP
  python main.py --instances 4        # Force number of instances
  python main.py --download-only      # Download model only
  python main.py --test               # Test with sample prompt
  python main.py --mcp                # Enable MCP server
  python main.py --federation         # Enable federation with other instances
  python main.py --federation --peer 192.168.1.10:17615  # Manual peer
"""
    )

    # Mode options
    parser.add_argument(
        "--auto",
        action="store_true",
        help="Auto-detect best configuration without interactive menu"
    )
    parser.add_argument(
        "--detect",
        action="store_true",
        help="Show hardware detection and exit"
    )

    # Model options
    parser.add_argument(
        "--model",
        type=str,
        help="Model to use (format: name:size:quant, e.g., qwen:3b:q4)"
    )
    parser.add_argument(
        "--instances",
        type=int,
        help="Force number of instances (overrides auto-calculation)"
    )

    # Server options
    parser.add_argument(
        "--port",
        type=int,
        default=17615,
        help="Port to run the API server on (default: 17615)"
    )
    parser.add_argument(
        "--host",
        type=str,
        default=None,
        help="Host IP to bind to (default: auto-detect)"
    )

    # Operation modes
    parser.add_argument(
        "--download-only",
        action="store_true",
        help="Download models only, don't start server"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test with a sample prompt"
    )
    parser.add_argument(
        "--mcp",
        action="store_true",
        help="Enable MCP server alongside HTTP API"
    )

    # Configuration
    parser.add_argument(
        "--config",
        type=str,
        default="config.yaml",
        help="Path to config file"
    )

    # Federation options
    parser.add_argument(
        "--federation",
        action="store_true",
        help="Enable federation with other Local Swarm instances on the network"
    )
    parser.add_argument(
        "--peer",
        action="append",
        dest="peers",
        help="Manually add a peer (format: host:port, can be used multiple times)"
    )

    # Tool server options
    parser.add_argument(
        "--tool-server",
        action="store_true",
        help="Run as dedicated tool execution server (executes read/write/bash tools)"
    )
    parser.add_argument(
        "--tool-port",
        type=int,
        default=17616,
        help="Port for tool execution server (default: 17616)"
    )
    parser.add_argument(
        "--tool-host",
        type=str,
        default=None,
        nargs='?',
        const='',
        help="URL of tool execution server. Use without value for auto-detected local IP"
    )
    parser.add_argument(
        "--use-opencode-tools",
        action="store_true",
        help="Use opencode's tool definitions (~27k tokens). Default: use local tool server"
    )

    # Version
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s 0.1.0"
    )

    return parser


def parse_args(args: Optional[list] = None):
    """Parse command line arguments.

    Args:
        args: Command line arguments (defaults to sys.argv)

    Returns:
        Parsed arguments namespace
    """
    parser = create_parser()
    return parser.parse_args(args)
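
# Usage sketch (illustrative, not in the diff): parse_args() accepts an explicit argv list,
# which is handy in tests; the flags below are the ones defined by create_parser() above.
args = parse_args(["--auto", "--federation", "--peer", "192.168.1.10:17615"])
assert args.auto is True
assert args.peers == ["192.168.1.10:17615"]
assert args.port == 17615  # default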
@@ -0,0 +1,103 @@
"""Server runner for Local Swarm."""

import asyncio
from typing import Optional

from api import create_server
from api.routes import set_federated_swarm
from utils.network import get_local_ip


class ServerRunner:
    """Handles server startup and shutdown."""

    def __init__(self, swarm, discovery, federated_swarm, args):
        """Initialize server runner.

        Args:
            swarm: Swarm manager instance
            discovery: Discovery service (optional)
            federated_swarm: Federated swarm (optional)
            args: Command line arguments
        """
        self.swarm = swarm
        self.discovery = discovery
        self.federated_swarm = federated_swarm
        self.args = args
        self.mcp_server = None

    async def run(self) -> int:
        """Run the server.

        Returns:
            Exit code
        """
        print("\n🌐 Starting HTTP API server...")

        # Determine host
        host = self._get_host()

        # Show tool mode
        self._show_tool_mode()

        # Create and start server
        server = create_server(
            self.swarm,
            host=host,
            port=self.args.port,
            use_opencode_tools=self.args.use_opencode_tools
        )

        self._print_connection_info(host)

        # Start server
        try:
            await server.start()
        finally:
            await self._shutdown()

        return 0

    def _get_host(self) -> str:
        """Get the host to bind to."""
        if self.args.host:
            print(f"🔗 Using specified host: {self.args.host}:{self.args.port}")
            return self.args.host
        else:
            host = get_local_ip()
            print(f"🔗 Binding to {host}:{self.args.port}")
            return host

    def _show_tool_mode(self) -> None:
        """Display tool mode information."""
        if self.args.use_opencode_tools:
            print(f"🔧 Tool mode: opencode tools (~27k tokens)")
        else:
            print(f"🔧 Tool mode: local tool server (~125 tokens)")

    def _print_connection_info(self, host: str) -> None:
        """Print server connection information."""
        print(f"\n✅ Local Swarm is running!")
        print(f" API: http://{host}:{self.args.port}/v1")
        print(f" Health: http://{host}:{self.args.port}/health")

        if self.args.federation and self.discovery:
            peers = self.discovery.get_peers()
            print(f"\n🌐 Federation: Enabled")
            print(f" Discovery port: {self.discovery.discovery_port}")
            if peers:
                print(f" Peers discovered: {len(peers)}")

        print(f"\n💡 Configure opencode to use:")
        print(f' base_url: http://127.0.0.1:{self.args.port}/v1')
        print(f' api_key: any (not used)')
        print(f"\nPress Ctrl+C to stop...\n")

    async def _shutdown(self) -> None:
        """Shutdown all services."""
        if self.federated_swarm:
            await self.federated_swarm.close()
        if self.discovery:
            await self.discovery.stop()
        if self.swarm:
            await self.swarm.shutdown()
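
# Minimal usage sketch (hypothetical wiring; the real call site is the application runner
# shown earlier in this diff). ServerRunner only needs a swarm, the optional discovery and
# federation handles, and the parsed CLI args.
#
#     runner = ServerRunner(swarm, discovery=None, federated_swarm=None, args=parse_args())
#     exit_code = asyncio.run(runner.run())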
@@ -0,0 +1,81 @@
"""Test mode runner for Local Swarm."""

import asyncio
from models.downloader import download_model_for_config
from swarm import SwarmManager
from interactive import show_startup_summary


async def run_test(hardware, config) -> int:
    """Run test mode with sample prompt.

    Args:
        hardware: Hardware profile
        config: Model configuration

    Returns:
        Exit code (0 for success, 1 for error)
    """
    print("\n" + "=" * 70)
    print("🧪 Test Mode: Running sample inference")
    print("=" * 70)

    show_startup_summary(hardware, config)

    # Download model
    print("\n⬇️ Downloading model...")
    try:
        model_path = download_model_for_config(config)
        print(f"✓ Model ready at: {model_path}")
    except Exception as e:
        print(f"\n❌ Error downloading model: {e}")
        return 1

    # Initialize swarm
    print("\n🚀 Initializing swarm...")
    try:
        swarm = SwarmManager(
            model_config=config,
            hardware=hardware,
            consensus_strategy="similarity"
        )

        success = await swarm.initialize(str(model_path))
        if not success:
            print("❌ Failed to initialize swarm")
            return 1
    except Exception as e:
        print(f"\n❌ Error initializing swarm: {e}")
        return 1

    try:
        # Test prompt
        prompt = "Write a Python function to calculate factorial:"
        print(f"\nPrompt: {prompt}\n")
        print("Generating responses...\n")

        result = await swarm.generate(prompt, max_tokens=200)

        print("\n" + "=" * 70)
        print("SELECTED RESPONSE:")
        print("=" * 70)
        print(result.selected_response.text)
        print("\n" + "=" * 70)
        print(f"Strategy: {result.strategy}")
        print(f"Confidence: {result.confidence:.2f}")
        print(f"Latency: {result.selected_response.latency_ms:.1f}ms")
        print(f"Tokens/sec: {result.selected_response.tokens_per_second:.1f}")

        # Show all responses
        print("\nAll responses received:")
        for i, resp in enumerate(result.all_responses):
            preview = resp.text[:60].replace('\n', ' ')
            print(f" Worker {i}: {preview}... ({resp.latency_ms:.1f}ms)")

        print("\n" + "=" * 70)
        print("✅ Test complete")
        print("=" * 70)
        return 0

    finally:
        await swarm.shutdown()
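
# Usage sketch (assumption: hardware and config come from the detection and selection
# helpers elsewhere in this change; the exact import paths may differ from those shown):
#
#     from hardware.detector import detect_hardware
#     from interactive.config_selection import get_recommended_config
#
#     hardware = detect_hardware()
#     config = get_recommended_config(hardware)
#     exit_code = asyncio.run(run_test(hardware, config))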
@@ -0,0 +1,69 @@
"""Tool server for Local Swarm.

Standalone tool execution server for distributed setups.
"""

import logging
from typing import Optional

from fastapi import FastAPI
import uvicorn

from tools.executor import ToolExecutor, set_tool_executor


logger = logging.getLogger(__name__)


def create_tool_server_app() -> FastAPI:
    """Create the tool server FastAPI application.

    Returns:
        Configured FastAPI application
    """
    app = FastAPI(title="Local Swarm Tool Server")

    @app.post("/v1/tools/execute")
    async def execute_tool(request: dict):
        tool_name = request.get("tool", "")
        tool_args = request.get("arguments", {})

        # Get the global executor
        from tools.executor import get_tool_executor
        executor = get_tool_executor()

        if executor is None:
            return {"result": "Error: No tool executor configured"}

        result = await executor.execute(tool_name, tool_args)
        return {"result": result}

    @app.get("/health")
    async def health():
        return {"status": "healthy", "mode": "tool-server"}

    return app


async def run_tool_server(host: str, port: int) -> None:
    """Run the tool server.

    Args:
        host: Host to bind to
        port: Port to listen on
    """
    # Initialize local tool executor
    tool_executor = ToolExecutor(tool_host_url=None)
    set_tool_executor(tool_executor)

    app = create_tool_server_app()

    print(f"🔗 Tool server running at http://{host}:{port}")
    print(f" Endpoints:")
    print(f" - POST /v1/tools/execute")
    print(f" - GET /health")
    print(f"\n✅ Tool server ready!")

    config = uvicorn.Config(app, host=host, port=port, log_level="warning")
    server = uvicorn.Server(config)
    await server.serve()
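
# Client-side sketch (illustrative; urllib is used only to keep the example dependency-free,
# any HTTP client works). The request/response shapes match the /v1/tools/execute handler
# above; tool names beyond read/write/bash (from the --tool-server help text) and the
# "path" argument key are assumptions.
import json
from urllib import request

def call_tool_server(base_url: str, tool: str, arguments: dict) -> str:
    payload = json.dumps({"tool": tool, "arguments": arguments}).encode()
    req = request.Request(
        f"{base_url}/v1/tools/execute",
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with request.urlopen(req) as resp:
        return json.loads(resp.read())["result"]

# call_tool_server("http://192.168.1.5:17616", "read", {"path": "README.md"})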
+132 -2
@@ -2,6 +2,7 @@

from dataclasses import dataclass
from typing import Optional, List
import os
import platform
import psutil

@@ -17,6 +18,8 @@ class GPUInfo:
    is_nvidia: bool = False
    is_amd: bool = False
    is_mobile: bool = False
    compute_capability: Optional[str] = None  # CUDA compute capability
    device_count: int = 1  # Number of GPUs available


@dataclass
@@ -70,10 +73,55 @@ class HardwareProfile:
        return self.available_memory_gb


def is_android() -> bool:
    """Check if running on Android (beyond just Termux)."""
    # Check multiple Android indicators

    # 1. Check for Android-specific environment variables
    android_env_vars = [
        "ANDROID_ROOT",
        "ANDROID_DATA",
        "ANDROID_ART_ROOT",
        "ANDROID_I18N_ROOT",
        "ANDROID_TZDATA_ROOT",
    ]
    if any(os.environ.get(var) for var in android_env_vars):
        return True

    # 2. Check for Android-specific paths
    android_paths = [
        "/system/build.prop",
        "/system/bin/app_process",
        "/data/data",
    ]
    if any(os.path.exists(path) for path in android_paths):
        return True

    # 3. Check for Termux (which runs on Android)
    if _is_android_or_termux():
        return True

    # 4. Check /proc/sys/kernel/osrelease for Android
    try:
        if os.path.exists("/proc/sys/kernel/osrelease"):
            with open("/proc/sys/kernel/osrelease", "r") as f:
                release = f.read().lower()
                if "android" in release:
                    return True
    except Exception:
        pass

    return False


def detect_os() -> str:
    """Detect the operating system."""
    system = platform.system().lower()
    if system == "darwin":

    # Check for Android first (reports as Linux)
    if system == "linux" and is_android():
        return "android"
    elif system == "darwin":
        return "darwin"
    elif system == "windows":
        return "windows"
@@ -132,6 +180,14 @@ def detect_nvidia_gpu() -> Optional[GPUInfo]:
        except Exception:
            driver = None

        # Get compute capability
        compute_capability = None
        try:
            major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle)
            compute_capability = f"{major}.{minor}"
        except Exception:
            pass

        return GPUInfo(
            name=name,
            vram_gb=vram_gb,
@@ -139,7 +195,9 @@ def detect_nvidia_gpu() -> Optional[GPUInfo]:
            device_id=0,
            is_nvidia=True,
            is_apple_silicon=False,
            is_amd=False
            is_amd=False,
            compute_capability=compute_capability,
            device_count=device_count
        )
    finally:
        pynvml.nvmlShutdown()
@@ -219,6 +277,78 @@ def detect_gpu() -> Optional[GPUInfo]:
    return None


def calculate_gpu_layers(gpu: Optional[GPUInfo]) -> int:
    """Calculate optimal number of GPU layers to offload.

    Args:
        gpu: GPU information (None if no GPU)

    Returns:
        Number of layers to offload (-1 = all, 0 = CPU only)
    """
    if gpu is None:
        return 0

    if gpu.is_apple_silicon:
        # Apple Silicon: offload all layers (unified memory)
        return -1

    if gpu.is_nvidia:
        # NVIDIA: Check compute capability for compatibility
        if gpu.compute_capability:
            major, _ = gpu.compute_capability.split('.')
            if int(major) < 5:
                # Very old GPUs (Kepler and earlier) may have issues
                return 0

        # Multi-GPU support: use device_count to determine layers
        # For now, offload all layers if we have any NVIDIA GPU
        return -1

    if gpu.is_amd:
        # AMD: ROCm support varies, be conservative
        return -1

    # Unknown GPU type: use CPU
    return 0


def validate_gpu_layers(requested_layers: int, gpu: Optional[GPUInfo]) -> int:
    """Validate and adjust requested GPU layers.

    Args:
        requested_layers: Requested number of layers (-1 = all)
        gpu: GPU information

    Returns:
        Validated layer count
    """
    if requested_layers == 0:
        return 0

    if gpu is None:
        if requested_layers != 0:
            raise ValueError(
                f"Requested {requested_layers} GPU layers but no GPU detected. "
                "Use n_gpu_layers=0 for CPU-only mode."
            )
        return 0

    if gpu.is_apple_silicon:
        # Apple Silicon always uses all layers
        return -1

    if gpu.is_nvidia and gpu.compute_capability:
        major, _ = gpu.compute_capability.split('.')
        if int(major) < 5:
            raise ValueError(
                f"NVIDIA GPU {gpu.name} has compute capability {gpu.compute_capability}. "
                f"Minimum required is 5.0. Use n_gpu_layers=0 for CPU mode."
            )

    return requested_layers


def detect_hardware() -> HardwareProfile:
    """Detect complete hardware profile."""
    os_name = detect_os()

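
# Worked sketch (illustrative values; assumes the remaining GPUInfo fields have defaults,
# which the hunk above does not show):
#
#     gpu = GPUInfo(name="RTX 3060", vram_gb=12.0, is_nvidia=True, compute_capability="8.6")
#     calculate_gpu_layers(gpu)        # -> -1, modern NVIDIA GPU: offload all layers
#     calculate_gpu_layers(None)       # ->  0, no GPU detected: stay on CPU
#     validate_gpu_layers(32, None)    # raises ValueError: layers requested without a GPU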
@@ -10,6 +10,64 @@ from typing import Optional
from hardware.detector import GPUInfo


# Android-specific file paths for common operations
ANDROID_PATHS = {
    "termux_home": "/data/data/com.termux/files/home",
    "termux_usr": "/data/data/com.termux/files/usr",
    "termux_bin": "/data/data/com.termux/files/usr/bin",
    "shared_storage": "/sdcard",
    "android_data": "/data/data",
}


def get_android_path(path_type: str, subpath: str = "") -> str:
    """Get Android-specific file path.

    Args:
        path_type: Type of path (termux_home, shared_storage, etc.)
        subpath: Additional path components

    Returns:
        Full path string
    """
    base = ANDROID_PATHS.get(path_type, path_type)
    if subpath:
        return os.path.join(base, subpath)
    return base


def normalize_path_for_android(path: str) -> str:
    """Normalize a path for Android/Termux environment.

    Args:
        path: Original path

    Returns:
        Normalized path for Android
    """
    # Expand user home directory properly on Android
    if path.startswith("~/"):
        if is_termux():
            home = ANDROID_PATHS["termux_home"]
        else:
            home = os.environ.get("HOME", "/")
        path = os.path.join(home, path[2:])

    # Handle /sdcard paths
    if path.startswith("/sdcard") and not os.path.exists("/sdcard"):
        # Try alternative storage paths
        alternatives = [
            "/storage/emulated/0",
            "/storage/self/primary",
        ]
        for alt in alternatives:
            if os.path.exists(alt):
                path = path.replace("/sdcard", alt, 1)
                break

    return os.path.normpath(path)
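
# Usage sketch (paths are illustrative): get_android_path() joins onto the known Termux
# roots, and normalize_path_for_android() expands "~" and falls back to /storage/emulated/0
# when /sdcard is not mounted.
#
#     get_android_path("termux_home", "models")
#     # -> "/data/data/com.termux/files/home/models"
#     normalize_path_for_android("~/models/qwen.gguf")
#     # -> Termux home + "/models/qwen.gguf" inside Termux, else "$HOME/models/qwen.gguf"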


def is_termux() -> bool:
    """Check if running in Termux environment."""
    return (

@@ -0,0 +1,122 @@
"""Configuration selection for Local Swarm interactive mode."""

from typing import List, Optional, Tuple
from hardware.detector import HardwareProfile
from models.registry import Model, list_models
from models.selector import ModelConfig, select_optimal_model, calculate_max_instances
from interactive.ui import print_section, MenuOption, display_menu


def get_recommended_config(
    hardware: HardwareProfile,
    context_size: int = 32768,
    offload_percent: float = 0.0
) -> Optional[ModelConfig]:
    """Get the recommended configuration for the hardware with context and offload settings."""
    use_mlx = hardware.is_apple_silicon if hardware else False
    return select_optimal_model(
        hardware,
        context_size=context_size,
        offload_percent=offload_percent,
        use_mlx=use_mlx
    )


def list_available_configurations(
    hardware: HardwareProfile,
    context_size: int = 32768,
    offload_percent: float = 0.0
) -> List[Tuple[str, ModelConfig]]:
    """List all feasible configurations for the hardware with context and offload settings."""
    from models.selector import calculate_memory_with_offload, get_available_memory_with_offload

    configs = []
    available_vram, available_ram = get_available_memory_with_offload(hardware, offload_percent)

    # Use MLX models on Apple Silicon
    use_mlx = hardware.is_apple_silicon if hardware else False
    is_mac = use_mlx

    for model in list_models(use_mlx=use_mlx):
        for variant in model.variants:
            for quant in variant.quantizations:
                # Calculate memory with context and offload
                if 'bit' in quant.name:
                    quantization_bits = int(quant.name.replace('bit', ''))
                elif 'q4' in quant.name:
                    quantization_bits = 4
                elif 'q5' in quant.name:
                    quantization_bits = 5
                elif 'q6' in quant.name:
                    quantization_bits = 6
                else:
                    quantization_bits = 4

                vram_per_instance, ram_per_instance = calculate_memory_with_offload(
                    quant.vram_gb, context_size, offload_percent, quantization_bits
                )

                # Check if at least 1 instance fits in VRAM
                if vram_per_instance <= available_vram:
                    if is_mac:
                        num_responses = 3
                        total_memory = vram_per_instance + ram_per_instance
                    else:
                        num_responses = calculate_max_instances(available_vram, vram_per_instance)
                        total_memory = (vram_per_instance + ram_per_instance) * num_responses

                    config = ModelConfig(
                        model=model,
                        variant=variant,
                        quantization=quant,
                        instances=num_responses,
                        memory_per_instance_gb=vram_per_instance + ram_per_instance,
                        total_memory_gb=total_memory,
                        context_size=context_size,
                        offload_percent=offload_percent,
                        vram_usage_gb=vram_per_instance,
                        ram_usage_gb=ram_per_instance
                    )

                    ctx_label = model.context_label
                    label = f"{model.name} [{ctx_label}] {variant.size} ({quant.name})"
                    configs.append((label, config))

    return configs


def select_context_size() -> int:
    """Let user select context window size."""
    print_section("Context Size Selection")
    print(" Context window determines how much text the model can process at once.")
    print(" Larger context = more memory usage but can handle longer code files.\n")

    options = [
        MenuOption("1", "16K tokens", "Good for small code files"),
        MenuOption("2", "32K tokens (Recommended)", "Best balance for most users"),
        MenuOption("3", "64K tokens", "Large codebases"),
        MenuOption("4", "128K tokens", "Very large files (uses more memory)"),
    ]

    choice = display_menu(options, "Select Context Size")

    context_map = {"1": 16384, "2": 32768, "3": 65536, "4": 131072}
    return context_map.get(choice, 32768)


def select_offload_option() -> float:
    """Let user select offloading option."""
    print_section("Memory Offloading")
    print(" Offloading moves some model layers to system RAM.")
    print(" This allows larger models/contexts but may be slower.\n")

    options = [
        MenuOption("1", "No offload (Default)", "100% GPU VRAM - fastest"),
        MenuOption("2", "20% offload", "80% GPU + 20% RAM - balanced"),
        MenuOption("3", "50% offload", "50% GPU + 50% RAM - maximum capacity"),
    ]

    choice = display_menu(options, "Select Offloading")

    offload_map = {"1": 0.0, "2": 0.2, "3": 0.5}
    return offload_map.get(choice, 0.0)
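
# Flow sketch (assumption: this is how the interactive menu is expected to chain these
# helpers; the actual call site lives elsewhere in the CLI):
#
#     context_size = select_context_size()          # e.g. 32768
#     offload_percent = select_offload_option()     # e.g. 0.2
#     config = get_recommended_config(hardware, context_size, offload_percent)
#     options = list_available_configurations(hardware, context_size, offload_percent)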
@@ -0,0 +1,87 @@
"""Display functions for Local Swarm interactive mode.

Hardware info and resource usage display.
"""

from typing import Optional
from hardware.detector import HardwareProfile
from interactive.ui import print_section


def print_hardware_info(hardware: HardwareProfile) -> None:
    """Print detailed hardware information."""
    print_section("Hardware Detection")

    print(f" Operating System: {hardware.os.capitalize()}")
    print(f" CPU: {hardware.cpu_cores} cores")
    print(f" System RAM: {hardware.ram_gb:.1f} GB")
    print(f" Available RAM: {hardware.ram_available_gb:.1f} GB")

    if hardware.gpu:
        print(f"\n GPU Detected:")
        print(f" Name: {hardware.gpu.name}")
        if hardware.is_apple_silicon:
            print(f" Type: Apple Silicon (Unified Memory)")
            print(f" Total Memory: {hardware.gpu.vram_gb:.1f} GB")
        else:
            print(f" Type: {hardware.gpu.name}")
            print(f" VRAM: {hardware.gpu.vram_gb:.1f} GB")
            if hardware.gpu.driver_version:
                print(f" Driver: {hardware.gpu.driver_version}")
    else:
        print(f"\n GPU: None detected (CPU-only mode)")

    if hardware.has_dedicated_gpu:
        # Dedicated GPU: hard limit based on VRAM
        print(f"\n Available for LLMs: {hardware.available_memory_gb:.1f} GB")
        print(f" (Using 100% of GPU VRAM minus buffer)")
    elif hardware.is_apple_silicon:
        # Apple Silicon: show recommendation vs limit (like CPU-only)
        print(f"\n Recommended for LLMs: {hardware.recommended_memory_gb:.1f} GB (50% of unified memory)")
        print(f" Maximum available: {hardware.available_memory_gb:.1f} GB (unified memory - 4GB safety)")
    else:
        # CPU-only: show recommendation vs limit
        print(f"\n Recommended for LLMs: {hardware.recommended_memory_gb:.1f} GB (50% of RAM)")
        print(f" Maximum available: {hardware.available_memory_gb:.1f} GB (system RAM - 4GB safety)")


def print_resource_usage(swarm_manager) -> None:
    """Print current resource usage if swarm is running."""
    if swarm_manager is None:
        return

    print_section("Current Resource Usage")

    status = swarm_manager.get_status()
    workers = swarm_manager.get_worker_info()

    print(f" Swarm Status: {'Running' if status.is_running else 'Stopped'}")
    print(f" Model: {status.model_name}")
    print(f" Workers: {status.healthy_workers}/{status.total_workers} healthy")
    print(f" Consensus Strategy: {status.strategy}")
    print(f" Memory Usage: {status.total_memory_gb:.2f} GB")
    print(f" Memory per Worker: {status.total_memory_gb / status.total_workers:.2f} GB" if status.total_workers > 0 else " Memory per Worker: N/A")

    if workers:
        print(f"\n Worker Details:")
        for w in workers:
            status_icon = "✓" if w.is_healthy else "✗"

            # Show IP for remote workers
            location = f" [{w.ip_address}]" if w.is_remote and w.ip_address else ""

            print(f" [{status_icon}] {w.name}{location}: {w.backend_name}")

            # Show live data if available
            if w.is_generating:
                progress_bar = "█" * int(w.progress / 5) + "░" * (20 - int(w.progress / 5))
                print(f" 🔄 Generating: {progress_bar} ({w.progress:.0f}%)")
                print(f" 📏 Context: {w.context_used:,} tokens")
                if w.last_output:
                    preview = w.last_output[:60].replace('\n', ' ')
                    print(f" 💬 Last: {preview}...")

            if w.stats.total_requests > 0:
                print(f" 📊 Requests: {w.stats.total_requests}")
                print(f" ⏱️ Avg Latency: {w.stats.avg_latency_ms:.1f}ms")
                print(f" 🚀 Tokens/sec: {w.stats.tokens_per_second:.1f}")
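
# Worked example of the 20-character progress bar above (each block covers 5%):
# w.progress = 42  ->  int(42 / 5) = 8 filled blocks, 12 empty blocks
#     "████████░░░░░░░░░░░░ (42%)"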
@@ -0,0 +1,226 @@
|
||||
"""Tips and help content for Local Swarm.
|
||||
|
||||
Educational content about models, quantization, and optimization.
|
||||
"""
|
||||
|
||||
from hardware.detector import HardwareProfile
|
||||
from interactive.ui import clear_screen, print_header, print_section
|
||||
|
||||
|
||||
def show_model_recommendations():
|
||||
"""Display model recommendations."""
|
||||
clear_screen()
|
||||
print_header("Model Recommendations")
|
||||
|
||||
print_section("Best Models for Coding (Ranked)")
|
||||
print("""
|
||||
🥇 Qwen 2.5 Coder - BEST OVERALL
|
||||
• Excellent code completion and generation
|
||||
• Strong performance even at smaller sizes (3B)
|
||||
• Good at following instructions
|
||||
• Recommended for most users
|
||||
|
||||
🥈 DeepSeek Coder - GREAT ALTERNATIVE
|
||||
• Very capable on coding tasks
|
||||
• Good balance of speed and quality
|
||||
• Smaller 1.3B option for low-end hardware
|
||||
|
||||
🥉 CodeLlama - SOLID CHOICE
|
||||
• Meta's dedicated code model
|
||||
• Good performance, widely tested
|
||||
• Larger sizes (13B+) for complex tasks
|
||||
|
||||
Other Good Options:
|
||||
• Llama 3.2 - General model with good coding skills
|
||||
• Phi-4 - Microsoft's efficient small model
|
||||
• Gemma 2 - Google's open model
|
||||
• StarCoder2 - Good for code completion
|
||||
|
||||
Which size to choose?
|
||||
• 1-3B: Fast, good for simple tasks, low VRAM
|
||||
• 7B: Sweet spot for most users
|
||||
• 13-15B: Better quality, needs more VRAM
|
||||
• 30B+: Best quality but very slow
|
||||
""")
|
||||
input("\n Press Enter to continue...")
|
||||
|
||||
|
||||
def show_quantization_guide():
|
||||
"""Display quantization guide."""
|
||||
clear_screen()
|
||||
print_header("Quantization Guide")
|
||||
|
||||
print_section("What is Quantization?")
|
||||
print("""
|
||||
Quantization compresses the model to use less memory.
|
||||
Lower precision = smaller size = faster loading
|
||||
But may reduce quality slightly.
|
||||
""")
|
||||
|
||||
print_section("Quantization Levels")
|
||||
print("""
|
||||
Q4_K_M (Good) - RECOMMENDED FOR MOST USERS
|
||||
• 4-bit quantization with medium quality
|
||||
• ~70% smaller than original
|
||||
• Minimal quality loss for coding
|
||||
• Best speed/memory/quality balance
|
||||
• Use this if unsure!
|
||||
|
||||
Q5_K_M (Better)
|
||||
• 5-bit quantization with better quality
|
||||
• ~60% smaller than original
|
||||
• Better for complex reasoning
|
||||
• Slightly more VRAM needed
|
||||
|
||||
Q6_K (Best)
|
||||
• 6-bit quantization with highest quality
|
||||
• ~50% smaller than original
|
||||
• Close to original model quality
|
||||
• Requires more VRAM
|
||||
• Use if you have plenty of memory
|
||||
|
||||
When to use each:
|
||||
• Q4_K_M: Default choice, works great
|
||||
• Q5_K_M: If you have extra VRAM, want better quality
|
||||
• Q6_K: If VRAM is abundant, want best quality
|
||||
""")
|
||||
|
||||
print_section("Quick Reference")
|
||||
print("""
|
||||
Size comparison for 7B model:
|
||||
• Original (FP16): ~14 GB
|
||||
• Q6_K: ~6 GB
|
||||
• Q5_K_M: ~5.2 GB
|
||||
• Q4_K_M: ~4.5 GB
|
||||
""")
|
||||
input("\n Press Enter to continue...")
|
||||
|
||||
|
||||
def show_instance_tips(hardware: HardwareProfile):
|
||||
"""Display tips for optimal instance count."""
|
||||
clear_screen()
|
||||
print_header("Instance Count Optimization")
|
||||
|
||||
print_section("What Are Instances?")
|
||||
print("""
|
||||
Each instance = one copy of the model running.
|
||||
Multiple instances = multiple workers voting on answers.
|
||||
More instances = better consensus but uses more memory.
|
||||
""")
|
||||
|
||||
print_section("Recommended Instance Counts")
|
||||
print(f"""
|
||||
Based on your hardware ({hardware.available_memory_gb:.1f} GB available):
|
||||
|
||||
Minimum: 2 instances
|
||||
• Required for consensus voting
|
||||
• Detects bad/hallucinated responses
|
||||
• Better than single model
|
||||
|
||||
Good Range: 3-5 instances
|
||||
• Most common setup
|
||||
• Good consensus quality
|
||||
• Reasonable memory usage
|
||||
• Recommended sweet spot
|
||||
|
||||
Maximum: 8 instances
|
||||
• Best consensus quality
|
||||
• Higher memory usage
|
||||
• Diminishing returns after 5-6
|
||||
• Use only if VRAM abundant
|
||||
|
||||
Research Note:
|
||||
Studies show consensus with 3-5 models gives 85-90%
|
||||
of the benefit, with minimal overhead. More than 8
|
||||
provides minimal improvement.
|
||||
""")
|
||||
|
||||
print_section("Memory Calculation Example")
|
||||
print(f"""
|
||||
Your available memory: {hardware.available_memory_gb:.1f} GB
|
||||
|
||||
Example: 7B model at Q4_K_M (4.5 GB per instance)
|
||||
• 2 instances: 9.0 GB used
|
||||
• 3 instances: 13.5 GB used
|
||||
• 4 instances: 18.0 GB used
|
||||
|
||||
Rule of thumb: Leave 10% buffer for overhead
|
||||
""")
|
||||
input("\n Press Enter to continue...")
|
||||
|
||||
|
||||
def show_hardware_tips(hardware: HardwareProfile):
|
||||
"""Display hardware-specific tips."""
|
||||
clear_screen()
|
||||
print_header("Hardware Optimization Tips")
|
||||
|
||||
print_section("Your Hardware Profile")
|
||||
print(f"""
|
||||
OS: {hardware.os.capitalize()}
|
||||
CPU: {hardware.cpu_cores} cores
|
||||
Available Memory: {hardware.available_memory_gb:.1f} GB
|
||||
GPU: {hardware.gpu.name if hardware.gpu else "None (CPU mode)"}
|
||||
""")
|
||||
|
||||
if hardware.is_apple_silicon:
|
||||
print_section("Apple Silicon Tips")
|
||||
print("""
|
||||
✓ Using MLX backend (optimized for Metal)
|
||||
✓ Unified memory architecture
|
||||
✓ 50% of RAM allocated for LLMs
|
||||
|
||||
Tips:
|
||||
• Use Q4_K_M quantization for best balance
|
||||
• 7B models work great on 16GB+ Macs
|
||||
• 3B models good for 8GB Macs
|
||||
• M1/M2/M3 all supported
|
||||
• Close other apps for best performance
|
||||
""")
|
||||
elif hardware.gpu and not hardware.is_apple_silicon:
|
||||
print_section("Discrete GPU Tips")
|
||||
print(f"""
|
||||
✓ GPU: {hardware.gpu.name}
|
||||
✓ Using 100% of VRAM
|
||||
|
||||
Tips:
|
||||
• Install CUDA/ROCm drivers for acceleration
|
||||
• Use Q4_K_M or Q5_K_M quantization
|
||||
• Monitor GPU temperature during long runs
|
||||
• Close GPU-intensive apps (games, etc.)
|
||||
• 7B-13B models work well on 8-16GB VRAM
|
||||
""")
|
||||
else:
|
||||
print_section("CPU-Only Tips")
|
||||
print("""
|
||||
✓ Running in CPU mode
|
||||
✓ 50% of system RAM allocated
|
||||
|
||||
Tips:
|
||||
• Use smaller models (3B-4B) for speed
|
||||
• Use Q4_K_M quantization
|
||||
• Fewer instances (2-3) recommended
|
||||
• Expect slower generation than GPU
|
||||
• Good for testing, not production use
|
||||
• Consider cloud GPU for heavy use
|
||||
""")
|
||||
|
||||
print_section("General Optimization")
|
||||
print("""
|
||||
Speed vs Quality:
|
||||
• Smaller models (3B) = faster, less capable
|
||||
• Larger models (7B+) = slower, smarter
|
||||
• Q4 = faster, less precise
|
||||
• Q6 = slower, more precise
|
||||
|
||||
Memory Management:
|
||||
• Leave 10-20% RAM/VRAM free
|
||||
• Close browsers and heavy apps
|
||||
• Use swap if necessary (slower)
|
||||
|
||||
Best Practices:
|
||||
• Start with recommended config
|
||||
• Test with --test flag first
|
||||
• Monitor memory usage
|
||||
• Adjust instances based on performance
|
||||
""")
|
||||
input("\n Press Enter to continue...")
|
||||
@@ -0,0 +1,63 @@
"""UI utilities for Local Swarm interactive mode.

Terminal display helpers and formatting functions.
"""

import subprocess
import os
from typing import List
from dataclasses import dataclass


@dataclass
class MenuOption:
    """A menu option."""
    key: str
    label: str
    description: str = ""


def clear_screen():
    """Clear the terminal screen."""
    subprocess.run(['cls' if os.name == 'nt' else 'clear'], shell=True, check=False)


def print_header(title: str):
    """Print a formatted header."""
    width = 70
    print("=" * width)
    print(f" {title}".ljust(width))
    print("=" * width)
    print()


def print_section(title: str):
    """Print a section title."""
    print(f"\n{'─' * 70}")
    print(f" {title}")
    print(f"{'─' * 70}")


def display_menu(options: List[MenuOption], title: str = "Menu") -> str:
    """Display a menu and return the user's choice.

    Args:
        options: List of menu options
        title: Menu title

    Returns:
        Selected option key
    """
    print_section(title)

    for opt in options:
        desc = f" - {opt.description}" if opt.description else ""
        print(f" [{opt.key}] {opt.label}{desc}")

    print()
    while True:
        choice = input(" Enter your choice: ").strip().lower()
        valid_keys = [opt.key.lower() for opt in options]
        if choice in valid_keys:
            return choice
        print(f" Invalid choice. Please enter one of: {', '.join(valid_keys)}")
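
# Usage sketch: building and showing a menu with the helpers above (labels are illustrative).
options = [
    MenuOption("1", "Start server", "Run with the recommended configuration"),
    MenuOption("2", "Quit"),
]
choice = display_menu(options, "Main Menu")  # blocks until "1" or "2" is entered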
@@ -0,0 +1,107 @@
"""Memory calculation utilities for model selection."""

from typing import Tuple


def calculate_context_memory(context_size: int, quantization_bits: int = 4) -> float:
    """Calculate additional memory needed for KV cache based on context size.

    Args:
        context_size: Number of tokens in context window
        quantization_bits: Quantization bits (4 for Q4, 5 for Q5, etc.)

    Returns:
        Additional VRAM needed in GB
    """
    # KV cache memory per token: 2 * num_layers * hidden_dim * bytes_per_param
    # Rough estimate: ~0.5MB per 1K tokens for 4-bit quantization
    bytes_per_token = (quantization_bits / 8) * 0.5  # 0.5 MB base per token at fp16
    memory_mb = (context_size / 1000) * bytes_per_token * 1000
    return memory_mb / 1024  # Convert to GB


def calculate_memory_with_offload(
    base_vram_gb: float,
    context_size: int,
    offload_percent: float,
    quantization_bits: int = 4
) -> Tuple[float, float]:
    """Calculate VRAM and RAM usage with offloading.

    Args:
        base_vram_gb: Base model VRAM without context
        context_size: Context window size in tokens
        offload_percent: Percentage of model offloaded to RAM (0.0-1.0)
        quantization_bits: Quantization precision

    Returns:
        (vram_usage_gb, ram_usage_gb)
    """
    # Context memory (KV cache) - always in VRAM for speed
    context_memory = calculate_context_memory(context_size, quantization_bits)

    # Model weights split between GPU and RAM
    gpu_model_memory = base_vram_gb * (1 - offload_percent)
    ram_model_memory = base_vram_gb * offload_percent

    # Context cache stays in VRAM for performance
    vram_total = gpu_model_memory + context_memory
    ram_total = ram_model_memory

    return vram_total, ram_total


def get_available_memory_with_offload(
    hardware,
    offload_percent: float
) -> Tuple[float, float]:
    """Get available GPU VRAM and system RAM considering offloading.

    Args:
        hardware: Hardware profile
        offload_percent: Offloading percentage

    Returns:
        (available_vram_gb, available_ram_gb)
    """
    if hardware.gpu and not hardware.is_apple_silicon:
        # External GPU - use GPU VRAM + potentially some system RAM
        available_vram = hardware.gpu.vram_gb * 0.9  # 10% buffer
        available_ram = hardware.ram_gb * 0.5 * offload_percent  # Portion of RAM for offload
    elif hardware.is_apple_silicon:
        # Apple Silicon - unified memory
        available_total = hardware.available_memory_gb
        available_vram = available_total * (1 - offload_percent)
        available_ram = available_total * offload_percent
    else:
        # CPU only - use full available memory (RAM - 4GB safety)
        available_vram = hardware.available_memory_gb
        available_ram = 0

    return available_vram, available_ram


def calculate_max_instances(
    available_vram: float,
    vram_per_instance: float,
    optimal: bool = False
) -> int:
    """Calculate maximum number of instances that fit in available VRAM.

    Args:
        available_vram: Available VRAM in GB
        vram_per_instance: VRAM needed per instance
        optimal: If True, cap at optimal max (5) instead of hard max (8)

    Returns:
        Maximum number of instances
    """
    from models.selector import MAX_INSTANCES, OPTIMAL_MAX_INSTANCES, MEMORY_OVERHEAD_FACTOR

    if vram_per_instance <= 0:
        return 1

    max_possible = int((available_vram * MEMORY_OVERHEAD_FACTOR) / vram_per_instance)
    max_allowed = OPTIMAL_MAX_INSTANCES if optimal else MAX_INSTANCES

    return max(1, min(max_possible, max_allowed))
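
# Worked example using the estimates above (7B model at ~4.5 GB, 32K context, 20% of the
# weights offloaded to RAM); the numbers follow directly from the formulas in this file:
#
#     calculate_context_memory(32768, 4)
#     # bytes_per_token = (4 / 8) * 0.5 = 0.25; (32768 / 1000) * 0.25 * 1000 = 8192 MB ≈ 8.0 GB
#     calculate_memory_with_offload(4.5, 32768, 0.2, 4)
#     # VRAM: 4.5 * 0.8 + 8.0 = 11.6 GB, RAM: 4.5 * 0.2 = 0.9 GB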
+203
-294
@@ -1,9 +1,12 @@
|
||||
"""Model registry for Local Swarm."""
|
||||
"""Model registry for Local Swarm.
|
||||
|
||||
Loads model data from JSON configuration files.
|
||||
"""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List, Optional
|
||||
from pathlib import Path
|
||||
import yaml
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -11,7 +14,7 @@ class QuantizationConfig:
|
||||
"""Configuration for a specific quantization level."""
|
||||
name: str
|
||||
vram_gb: float
|
||||
quality: str # 'fast', 'good', 'better', 'best'
|
||||
quality: str
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"QuantizationConfig({self.name}, {self.vram_gb}GB, {self.quality})"
|
||||
@@ -33,7 +36,6 @@ class ModelVariant:
|
||||
|
||||
def get_best_quantization_for_memory(self, available_gb: float) -> Optional[QuantizationConfig]:
|
||||
"""Get the best quantization that fits in available memory."""
|
||||
# Sort by quality (best first) then by VRAM (smallest first)
|
||||
sorted_quants = sorted(
|
||||
self.quantizations,
|
||||
key=lambda q: (['fast', 'good', 'better', 'best'].index(q.quality), q.vram_gb),
|
||||
@@ -53,8 +55,8 @@ class Model:
|
||||
name: str
|
||||
description: str
|
||||
variants: List[ModelVariant]
|
||||
priority: int = 100 # Lower = higher priority
|
||||
max_context: int = 8192 # Maximum context window in tokens
|
||||
priority: int = 100
|
||||
max_context: int = 8192
|
||||
|
||||
def get_variant(self, size: str) -> Optional[ModelVariant]:
|
||||
"""Get a specific size variant."""
|
||||
@@ -67,7 +69,6 @@ class Model:
|
||||
"""Get the largest variant available."""
|
||||
if not self.variants:
|
||||
return None
|
||||
# Sort by base VRAM (largest first)
|
||||
return sorted(self.variants, key=lambda v: v.base_vram_gb, reverse=True)[0]
|
||||
|
||||
@property
|
||||
@@ -85,315 +86,223 @@ class Model:
|
||||
return f"{self.max_context//1000}K"
|
||||
|
||||
|
||||
# MLX quantization sizes (GB) based on mlx-community models
|
||||
# HARDCODED: These are verified to exist on HuggingFace mlx-community
|
||||
# Last verified: 2025-02-23
|
||||
# DO NOT make API calls on startup - use this hardcoded list
|
||||
MLX_QUANT_SIZES = {
|
||||
# Format: model_id: {variant_size: {quant_bit: vram_gb}}
|
||||
# Only includes quantizations that actually exist on HF
|
||||
"qwen2.5-coder": {
|
||||
"3b": {"3bit": 1.3, "4bit": 1.7, "6bit": 2.5, "8bit": 3.3},
|
||||
# 5bit does NOT exist for 3b
|
||||
"7b": {"3bit": 3.1, "4bit": 4.1, "6bit": 6.1, "8bit": 8.1},
|
||||
# 5bit does NOT exist for 7b
|
||||
"14b": {"3bit": 6.2, "4bit": 8.2, "6bit": 12.2, "8bit": 16.2},
|
||||
# 5bit does NOT exist for 14b
|
||||
},
|
||||
"deepseek-coder": {
|
||||
"1.3b": {"4bit": 0.8, "6bit": 1.2},
|
||||
# 3bit, 5bit, 8bit do NOT exist
|
||||
"6.7b": {"4bit": 3.9, "6bit": 5.9, "8bit": 7.9},
|
||||
# 3bit, 5bit do NOT exist
|
||||
},
|
||||
"codellama": {
|
||||
"7b": {"4bit": 4.1, "6bit": 6.1, "8bit": 8.1},
|
||||
# 3bit, 5bit do NOT exist
|
||||
"13b": {"4bit": 7.6, "6bit": 11.4, "8bit": 15.2},
|
||||
# 3bit, 5bit do NOT exist
|
||||
},
|
||||
"llama-3.2": {
|
||||
"1b": {"4bit": 0.6, "8bit": 1.2},
|
||||
# 3bit, 5bit, 6bit do NOT exist
|
||||
"3b": {"4bit": 1.8, "6bit": 2.6, "8bit": 3.5},
|
||||
# 3bit, 5bit do NOT exist
|
||||
},
|
||||
"phi-4": {
|
||||
"4b": {"4bit": 2.4, "6bit": 3.6, "8bit": 4.8},
|
||||
# 3bit, 5bit do NOT exist
|
||||
},
|
||||
"gemma-2": {
|
||||
"2b": {"4bit": 1.2, "6bit": 1.8, "8bit": 2.4},
|
||||
# 3bit, 5bit do NOT exist
|
||||
"4b": {"4bit": 2.4, "6bit": 3.6, "8bit": 4.8},
|
||||
# 3bit, 5bit do NOT exist
|
||||
"9b": {"4bit": 5.3, "6bit": 7.9, "8bit": 10.5},
|
||||
# 3bit, 5bit do NOT exist
|
||||
},
|
||||
"starcoder2": {
|
||||
"3b": {"4bit": 1.8, "6bit": 2.6, "8bit": 3.5},
|
||||
# 3bit, 5bit do NOT exist
|
||||
"7b": {"4bit": 4.1, "6bit": 6.1, "8bit": 8.1},
|
||||
# 3bit, 5bit do NOT exist
|
||||
"15b": {"4bit": 8.8, "6bit": 13.2, "8bit": 17.6},
|
||||
# 3bit, 5bit do NOT exist
|
||||
},
|
||||
}
|
||||
|
||||
# Quality mapping for MLX quantizations
|
||||
MLX_QUALITY_MAP = {
|
||||
"3bit": "fast",
|
||||
"4bit": "good",
|
||||
"5bit": "better",
|
||||
"6bit": "best",
|
||||
"8bit": "best",
|
||||
}
|
||||
|
||||
# Base model metadata (without quantization-specific data)
|
||||
MODEL_METADATA = {
|
||||
"qwen2.5-coder": {
|
||||
"name": "Qwen 2.5 Coder",
|
||||
"description": "Alibaba's code-focused model, excellent for small sizes",
|
||||
"priority": 1,
|
||||
"max_context": 128000,
|
||||
"variants": ["3b", "7b", "14b"],
|
||||
},
|
||||
"deepseek-coder": {
|
||||
"name": "DeepSeek Coder",
|
||||
"description": "DeepSeek's code model, good alternative",
|
||||
"priority": 2,
|
||||
"max_context": 16384,
|
||||
"variants": ["1.3b", "6.7b"],
|
||||
},
|
||||
"codellama": {
|
||||
"name": "CodeLlama",
|
||||
"description": "Meta's code model",
|
||||
"priority": 3,
|
||||
"max_context": 16384,
|
||||
"variants": ["7b", "13b"],
|
||||
},
|
||||
"llama-3.2": {
|
||||
"name": "Llama 3.2",
|
||||
"description": "Meta's latest general-purpose model with strong coding abilities",
|
||||
"priority": 4,
|
||||
"max_context": 128000,
|
||||
"variants": ["1b", "3b"],
|
||||
},
|
||||
"phi-4": {
|
||||
"name": "Phi-4",
|
||||
"description": "Microsoft's efficient small model with excellent coding performance",
|
||||
"priority": 5,
|
||||
"max_context": 16384,
|
||||
"variants": ["4b"],
|
||||
},
|
||||
"gemma-2": {
|
||||
"name": "Gemma 2",
|
||||
"description": "Google's open model, good for coding tasks",
|
||||
"priority": 6,
|
||||
"max_context": 8192,
|
||||
"variants": ["2b", "4b", "9b"],
|
||||
},
|
||||
"starcoder2": {
|
||||
"name": "StarCoder2",
|
||||
"description": "BigCode's open code generation model",
|
||||
"priority": 7,
|
||||
"max_context": 8192,
|
||||
"variants": ["3b", "7b", "15b"],
|
||||
},
|
||||
}
|
||||
|
||||
# GGUF quantization sizes (GB) - accurate sizes
|
||||
GGUF_QUANT_SIZES = {
|
||||
"qwen2.5-coder": {
|
||||
"3b": {"q4_k_m": 1.8, "q5_k_m": 2.2, "q6_k": 2.6},
|
||||
"7b": {"q4_k_m": 4.5, "q5_k_m": 5.2, "q6_k": 6.0},
|
||||
"14b": {"q4_k_m": 8.8, "q5_k_m": 10.5},
|
||||
},
|
||||
"deepseek-coder": {
|
||||
"1.3b": {"q4_k_m": 0.8, "q5_k_m": 1.0},
|
||||
"6.7b": {"q4_k_m": 4.2, "q5_k_m": 5.0},
|
||||
},
|
||||
"codellama": {
|
||||
"7b": {"q4_k_m": 4.5, "q5_k_m": 5.2},
|
||||
"13b": {"q4_k_m": 8.0, "q5_k_m": 9.5},
|
||||
},
|
||||
"llama-3.2": {
|
||||
"3b": {"q4_k_m": 1.9, "q5_k_m": 2.3, "q6_k": 2.7},
|
||||
"1b": {"q4_k_m": 0.7, "q5_k_m": 0.9},
|
||||
},
|
||||
"phi-4": {
|
||||
"4b": {"q4_k_m": 2.4, "q5_k_m": 2.9, "q6_k": 3.4},
|
||||
},
|
||||
"gemma-2": {
|
||||
"2b": {"q4_k_m": 1.5, "q5_k_m": 1.8},
|
||||
"4b": {"q4_k_m": 2.7, "q5_k_m": 3.2, "q6_k": 3.8},
|
||||
"9b": {"q4_k_m": 5.5, "q5_k_m": 6.5},
|
||||
},
|
||||
"starcoder2": {
|
||||
"3b": {"q4_k_m": 1.9, "q5_k_m": 2.3},
|
||||
"7b": {"q4_k_m": 4.5, "q5_k_m": 5.2, "q6_k": 6.1},
|
||||
"15b": {"q4_k_m": 9.2, "q5_k_m": 10.8},
|
||||
},
|
||||
}
|
||||
|
||||
# GGUF quality mapping
|
||||
GGUF_QUALITY_MAP = {
|
||||
"q4_k_m": "good",
|
||||
"q5_k_m": "better",
|
||||
"q6_k": "best",
|
||||
}
|
||||
|
||||
|
||||
def get_quantization_sizes(model_id: str, use_mlx: bool = False) -> Dict[str, Dict[str, float]]:
|
||||
"""Get quantization sizes for a model."""
|
||||
if use_mlx:
|
||||
return MLX_QUANT_SIZES.get(model_id, {})
|
||||
else:
|
||||
return GGUF_QUANT_SIZES.get(model_id, {})
|
||||
|
||||
|
||||
def get_quality_map(use_mlx: bool = False) -> Dict[str, str]:
|
||||
"""Get quality mapping for quantizations."""
|
||||
if use_mlx:
|
||||
return MLX_QUALITY_MAP
|
||||
else:
|
||||
return GGUF_QUALITY_MAP
|
||||
|
||||
|
||||
def build_model_variants(model_id: str, use_mlx: bool = False) -> List[ModelVariant]:
|
||||
"""Build model variants with appropriate quantizations for the platform."""
|
||||
metadata = MODEL_METADATA.get(model_id)
|
||||
if not metadata:
|
||||
return []
|
||||
class ModelRegistry:
|
||||
"""Registry for loading and managing models from config files."""
|
||||
|
||||
quality_map = get_quality_map(use_mlx)
|
||||
variants = []
|
||||
def __init__(self):
|
||||
"""Initialize registry and load config files."""
|
||||
self.config_dir = Path(__file__).parent.parent.parent / "config" / "models"
|
||||
self._metadata: Dict = {}
|
||||
self._mlx_sizes: Dict = {}
|
||||
self._gguf_sizes: Dict = {}
|
||||
self._load_configs()
|
||||
|
||||
for variant_size in metadata["variants"]:
|
||||
quant_sizes = get_quantization_sizes(model_id, use_mlx).get(variant_size, {})
|
||||
def _load_configs(self) -> None:
|
||||
"""Load all configuration files."""
|
||||
# Load model metadata
|
||||
metadata_path = self.config_dir / "model_metadata.json"
|
||||
if metadata_path.exists():
|
||||
with open(metadata_path, 'r') as f:
|
||||
self._metadata = json.load(f)
|
||||
|
||||
if not quant_sizes:
|
||||
continue
|
||||
# Load MLX quantization sizes
|
||||
mlx_path = self.config_dir / "mlx_quant_sizes.json"
|
||||
if mlx_path.exists():
|
||||
with open(mlx_path, 'r') as f:
|
||||
self._mlx_sizes = json.load(f)
|
||||
|
||||
quantizations = []
|
||||
for quant_name, vram_gb in quant_sizes.items():
|
||||
quality = quality_map.get(quant_name, "good")
|
||||
quantizations.append(QuantizationConfig(quant_name, vram_gb, quality))
|
||||
|
||||
# Calculate base VRAM (rough estimate: use 8bit or largest quant size)
|
||||
base_vram = max(quant_sizes.values()) * 1.5
|
||||
|
||||
variants.append(ModelVariant(
|
||||
size=variant_size,
|
||||
base_vram_gb=base_vram,
|
||||
quantizations=quantizations
|
||||
))
|
||||
# Load GGUF quantization sizes
|
||||
gguf_path = self.config_dir / "gguf_quant_sizes.json"
|
||||
if gguf_path.exists():
|
||||
with open(gguf_path, 'r') as f:
|
||||
self._gguf_sizes = json.load(f)
|
||||
|
||||
return variants
|
||||
|
||||
|
||||
def build_models(use_mlx: bool = False) -> Dict[str, Model]:
|
||||
"""Build the model registry with platform-appropriate quantizations."""
|
||||
models = {}
|
||||
|
||||
for model_id, metadata in MODEL_METADATA.items():
|
||||
variants = build_model_variants(model_id, use_mlx)
|
||||
def get_model(self, model_id: str, use_mlx: bool = False) -> Optional[Model]:
|
||||
"""Get a model by ID."""
|
||||
if model_id not in self._metadata:
|
||||
return None
|
||||
|
||||
meta = self._metadata[model_id]
|
||||
# Ensure meta is a dict (not a string like "_comment")
|
||||
if not isinstance(meta, dict):
|
||||
return None
|
||||
|
||||
sizes = self._mlx_sizes if use_mlx else self._gguf_sizes
|
||||
quality_map = self._get_quality_map(use_mlx)
|
||||
|
||||
variants = []
|
||||
for variant_size in meta.get("variants", []):
|
||||
quantizations = []
|
||||
size_data = sizes.get(model_id, {}).get(variant_size, {})
|
||||
|
||||
for quant_name, vram_gb in size_data.items():
|
||||
quality = quality_map.get(quant_name, "good")
|
||||
quantizations.append(QuantizationConfig(quant_name, vram_gb, quality))
|
||||
|
||||
if quantizations:
|
||||
base_vram = min(q.vram_gb for q in quantizations)
|
||||
variants.append(ModelVariant(variant_size, base_vram, quantizations))
|
||||
|
||||
if not variants:
|
||||
continue
|
||||
return None
|
||||
|
||||
models[model_id] = Model(
|
||||
return Model(
|
||||
id=model_id,
|
||||
name=metadata["name"],
|
||||
description=metadata["description"],
|
||||
name=meta.get("name", model_id),
|
||||
description=meta.get("description", ""),
|
||||
variants=variants,
|
||||
priority=metadata["priority"],
|
||||
max_context=metadata["max_context"],
|
||||
priority=meta.get("priority", 100),
|
||||
max_context=meta.get("max_context", 8192)
|
||||
)
|
||||
|
||||
return models
|
||||
def list_models(self, use_mlx: bool = False) -> List[Model]:
|
||||
"""List all available models."""
|
||||
models = []
|
||||
for model_id, meta in self._metadata.items():
|
||||
# Skip non-dict entries (like _comment)
|
||||
if not isinstance(meta, dict):
|
||||
continue
|
||||
model = self.get_model(model_id, use_mlx)
|
||||
if model:
|
||||
models.append(model)
|
||||
|
||||
# Sort by priority
|
||||
return sorted(models, key=lambda m: m.priority)
|
||||
|
||||
def _get_quality_map(self, use_mlx: bool) -> Dict[str, str]:
|
||||
"""Get quality mapping for quantization types."""
|
||||
if use_mlx:
|
||||
return {
|
||||
"3bit": "fast",
|
||||
"4bit": "good",
|
||||
"5bit": "better",
|
||||
"6bit": "best",
|
||||
"8bit": "best",
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"q4_k_m": "good",
|
||||
"q5_k_m": "better",
|
||||
"q6_k": "best",
|
||||
}
|
||||
|
||||
|
||||
# Default models (GGUF format for llama.cpp)
|
||||
DEFAULT_MODELS = build_models(use_mlx=False)
|
||||
# Global registry instance
|
||||
_registry = ModelRegistry()
|
||||
|
||||
|
||||
def get_model(model_id: str, use_mlx: bool = False) -> Optional[Model]:
|
||||
"""Get a model by ID with platform-appropriate quantizations."""
|
||||
if use_mlx:
|
||||
models = build_models(use_mlx=True)
|
||||
return models.get(model_id)
|
||||
else:
|
||||
return DEFAULT_MODELS.get(model_id)
|
||||
"""Get a model by ID."""
|
||||
return _registry.get_model(model_id, use_mlx)
|
||||
|
||||
|
||||
def list_models(use_mlx: bool = False) -> List[Model]:
|
||||
"""List all available models sorted by priority."""
|
||||
if use_mlx:
|
||||
models = build_models(use_mlx=True)
|
||||
else:
|
||||
models = DEFAULT_MODELS
|
||||
return sorted(models.values(), key=lambda m: m.priority)
|
||||
"""List all available models."""
|
||||
return _registry.list_models(use_mlx)
|
||||
|
||||
|
||||
def get_model_hf_repo(model_id: str, variant: ModelVariant, quant: QuantizationConfig) -> str:
|
||||
"""Get the HuggingFace repository path for a model (GGUF format)."""
|
||||
repo_map = {
|
||||
"qwen2.5-coder": f"Qwen/Qwen2.5-Coder-{variant.size}-Instruct-GGUF",
|
||||
"deepseek-coder": f"TheBloke/deepseek-coder-{variant.size}-base-GGUF",
|
||||
"codellama": f"TheBloke/CodeLlama-{variant.size}-Instruct-GGUF",
|
||||
"llama-3.2": f"bartowski/Llama-3.2-{variant.size}-Instruct-GGUF",
|
||||
"phi-4": f"unsloth/phi-4-GGUF",
|
||||
"gemma-2": f"bartowski/gemma-2-{variant.size}-it-GGUF",
|
||||
"starcoder2": f"TheBloke/starcoder2-{variant.size}-GGUF",
|
||||
}
|
||||
return repo_map.get(model_id, "")
|
||||
|
||||
|
||||
def get_model_hf_repo_mlx(model_id: str, variant: ModelVariant, quant: QuantizationConfig) -> str:
|
||||
"""Get the HuggingFace repository path for MLX quantized models (Apple Silicon)."""
|
||||
# Map GGUF quantization names to MLX quantization names
|
||||
# MLX uses simple names: 3bit, 4bit, 8bit, not q4_k_m, q6_k, etc.
|
||||
gguf_to_mlx_quant = {
|
||||
"q3_k_m": "3bit",
|
||||
"q4_k_m": "4bit",
|
||||
"q4_k": "4bit",
|
||||
"q5_k_m": "5bit",
|
||||
"q5_k": "5bit",
|
||||
"q6_k": "6bit",
|
||||
"q8_0": "8bit",
|
||||
"q8": "8bit",
|
||||
}
|
||||
def get_model_hf_repo(model_id: str, variant: ModelVariant, quant: QuantizationConfig) -> Optional[str]:
|
||||
"""Get HuggingFace repository ID for a GGUF model.
|
||||
|
||||
# MLX quantized models are in mlx-community org with -{quant}bit suffix
|
||||
# Map base model names to mlx-community quantized versions
|
||||
mlx_repo_map = {
|
||||
"qwen2.5-coder": f"mlx-community/Qwen2.5-Coder-{variant.size.capitalize()}-Instruct",
|
||||
"deepseek-coder": f"mlx-community/deepseek-coder-{variant.size}-base",
|
||||
"codellama": f"mlx-community/CodeLlama-{variant.size}-Instruct",
|
||||
"llama-3.2": f"mlx-community/Llama-3.2-{variant.size}-Instruct",
|
||||
"phi-4": f"mlx-community/phi-4",
|
||||
"gemma-2": f"mlx-community/gemma-2-{variant.size}-it",
|
||||
"starcoder2": f"mlx-community/starcoder2-{variant.size}",
|
||||
}
|
||||
Args:
|
||||
model_id: Model identifier
|
||||
variant: Model variant (size)
|
||||
quant: Quantization config
|
||||
|
||||
Returns:
|
||||
HuggingFace repo ID (e.g., "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF") or None if unknown
|
||||
"""
|
||||
# Get the base repo from metadata
|
||||
if model_id not in _registry._metadata:
|
||||
return None
|
||||
|
||||
base_repo = mlx_repo_map.get(model_id, "")
|
||||
if base_repo and quant:
|
||||
# Convert GGUF quant name to MLX quant name
|
||||
mlx_quant = gguf_to_mlx_quant.get(quant.name, quant.name)
|
||||
# Append quantization suffix
|
||||
return f"{base_repo}-{mlx_quant}"
|
||||
return base_repo
|
||||
meta = _registry._metadata[model_id]
|
||||
if not isinstance(meta, dict):
|
||||
return None
|
||||
|
||||
base_repo = meta.get("hf_repo")
|
||||
if not base_repo:
|
||||
return None
|
||||
|
||||
# Convert variant size (e.g., "14b" to "-14B") and construct repo ID
|
||||
size_suffix = f"-{variant.size.upper()}"
|
||||
|
||||
# For GGUF, add -Instruct-GGUF suffix
|
||||
repo_id = f"{base_repo}{size_suffix}-Instruct-GGUF"
|
||||
|
||||
return repo_id
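As a quick illustration of the construction above, the docstring's example repo can be reproduced as follows; the `hf_repo` value is an assumption taken from that example, not from the project's metadata file.

```python
# Minimal sketch of the GGUF repo-ID construction above.
# "Qwen/Qwen2.5-Coder" as the metadata hf_repo is an assumed example value.
base_repo = "Qwen/Qwen2.5-Coder"
variant_size = "7b"

size_suffix = f"-{variant_size.upper()}"               # "-7B"
repo_id = f"{base_repo}{size_suffix}-Instruct-GGUF"
print(repo_id)  # Qwen/Qwen2.5-Coder-7B-Instruct-GGUF
```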
def get_model_hf_repo_mlx(model_id: str, variant: ModelVariant, quant: QuantizationConfig) -> Optional[str]:
|
||||
"""Get HuggingFace repository ID for an MLX model.
|
||||
|
||||
Args:
|
||||
model_id: Model identifier
|
||||
variant: Model variant (size)
|
||||
quant: Quantization config
|
||||
|
||||
Returns:
|
||||
HuggingFace repo ID (e.g., "mlx-community/Qwen2.5-Coder-14B-Instruct-4bit") or None if unknown
|
||||
"""
|
||||
# Get the base repo from metadata
|
||||
if model_id not in _registry._metadata:
|
||||
return None
|
||||
|
||||
meta = _registry._metadata[model_id]
|
||||
if not isinstance(meta, dict):
|
||||
return None
|
||||
|
||||
base_repo = meta.get("hf_repo")
|
||||
if not base_repo:
|
||||
return None
|
||||
|
||||
# MLX models are typically in mlx-community namespace
|
||||
# Format: mlx-community/{ModelName}-{Size}-{Quantization}
|
||||
# For example: mlx-community/Qwen2.5-Coder-14B-Instruct-4bit
|
||||
|
||||
# Convert variant size (e.g., "14b" to "-14B")
|
||||
size_suffix = f"-{variant.size.upper()}"
|
||||
|
||||
# Add quantization suffix (e.g., "-4bit" for MLX quantization names)
|
||||
quant_suffix = f"-{quant.name}"
|
||||
|
||||
# Construct the full repo name
|
||||
model_name = base_repo.split('/')[-1] # Get just the model name, not the org
|
||||
repo_id = f"mlx-community/{model_name}{size_suffix}-Instruct{quant_suffix}"
|
||||
|
||||
return repo_id
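The MLX variant of the same construction, again using assumed example values that match the docstring:

```python
# Sketch of the MLX repo-ID construction above (assumed inputs).
base_repo = "Qwen/Qwen2.5-Coder"   # meta["hf_repo"], assumed example
variant_size = "14b"
quant_name = "4bit"                # MLX-style quantization name

model_name = base_repo.split('/')[-1]   # "Qwen2.5-Coder" (org prefix dropped)
repo_id = f"mlx-community/{model_name}-{variant_size.upper()}-Instruct-{quant_name}"
print(repo_id)  # mlx-community/Qwen2.5-Coder-14B-Instruct-4bit
```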
def get_model_filename(model_id: str, variant: ModelVariant, quant: QuantizationConfig) -> str:
|
||||
"""Get the GGUF filename for a model."""
|
||||
filename_map = {
|
||||
"qwen2.5-coder": f"qwen2.5-coder-{variant.size}-instruct-{quant.name}.gguf",
|
||||
"deepseek-coder": f"deepseek-coder-{variant.size}-base.{quant.name.upper()}.gguf",
|
||||
"codellama": f"codellama-{variant.size}-instruct.{quant.name.upper()}.gguf",
|
||||
"llama-3.2": f"Llama-3.2-{variant.size}-Instruct-{quant.name}.gguf",
|
||||
"phi-4": f"phi-4-{quant.name}.gguf",
|
||||
"gemma-2": f"gemma-2-{variant.size}-it-{quant.name}.gguf",
|
||||
"starcoder2": f"starcoder2-{variant.size}.{quant.name.upper()}.gguf",
|
||||
}
|
||||
return filename_map.get(model_id, f"{model_id}-{variant.size}-{quant.name}.gguf")
|
||||
"""Get the filename for a GGUF model file.
|
||||
|
||||
Args:
|
||||
model_id: Model identifier
|
||||
variant: Model variant (size)
|
||||
quant: Quantization config
|
||||
|
||||
Returns:
|
||||
GGUF filename (e.g., "qwen2.5-coder-14b-instruct-q4_k_m.gguf")
|
||||
"""
|
||||
# Extract model name from metadata
|
||||
if model_id not in _registry._metadata:
|
||||
meta = {"name": model_id, "hf_repo": model_id}
|
||||
else:
|
||||
meta = _registry._metadata[model_id]
|
||||
|
||||
if not isinstance(meta, dict):
|
||||
meta = {"name": model_id, "hf_repo": model_id}
|
||||
|
||||
# Use the base repo name or model name
|
||||
base_name = meta.get("hf_repo", meta.get("name", model_id))
|
||||
# Remove org prefix if present
|
||||
if '/' in base_name:
|
||||
base_name = base_name.split('/')[-1]
|
||||
|
||||
# Standard GGUF naming (all lowercase): {model}-{variant}-instruct-{quantization}.gguf
|
||||
# For example: qwen2.5-coder-14b-instruct-q4_k_m.gguf
|
||||
variant_size = f"-{variant.size.lower()}"
|
||||
quant_name = quant.name.lower()
|
||||
filename = f"{base_name.lower()}{variant_size}-instruct-{quant_name}.gguf"
|
||||
|
||||
return filename
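Tracing the same assumed values through the filename builder yields the docstring's example output:

```python
# Sketch of the GGUF filename construction above (assumed inputs).
base_name = "Qwen/Qwen2.5-Coder".split('/')[-1]   # drop the org prefix
variant_size = "14b"
quant_name = "q4_k_m"

filename = f"{base_name.lower()}-{variant_size}-instruct-{quant_name}.gguf"
print(filename)  # qwen2.5-coder-14b-instruct-q4_k_m.gguf
```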
+134 -290
@@ -1,9 +1,17 @@
|
||||
"""Model selection logic for Local Swarm."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
|
||||
from hardware.detector import HardwareProfile
|
||||
from models.registry import Model, ModelVariant, QuantizationConfig, list_models
|
||||
from models.memory_calculator import (
|
||||
calculate_memory_with_offload,
|
||||
get_available_memory_with_offload,
|
||||
calculate_max_instances
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -15,10 +23,10 @@ class ModelConfig:
|
||||
instances: int
|
||||
memory_per_instance_gb: float
|
||||
total_memory_gb: float
|
||||
context_size: int = 32768 # Context window in tokens (16K, 32K, 64K, 128K)
|
||||
offload_percent: float = 0.0 # Percentage of layers offloaded to RAM (0.0, 0.2, 0.5)
|
||||
vram_usage_gb: float = 0.0 # Actual VRAM usage per instance
|
||||
ram_usage_gb: float = 0.0 # System RAM usage per instance (when offloading)
|
||||
context_size: int = 32768
|
||||
offload_percent: float = 0.0
|
||||
vram_usage_gb: float = 0.0
|
||||
ram_usage_gb: float = 0.0
|
||||
|
||||
def __post_init__(self):
|
||||
"""Ensure default values are set if not provided."""
|
||||
@@ -42,147 +50,24 @@ class ModelConfig:
|
||||
return f"{self.model.name} {self.variant.size} ({self.quantization.name}, {self.context_size//1000}K ctx{offload_str})"
|
||||
|
||||
|
||||
# Configuration constraints
|
||||
MIN_INSTANCES = 1 # Allow 1 instance (needed for Apple Silicon MLX)
|
||||
MAX_INSTANCES = 8
|
||||
OPTIMAL_MAX_INSTANCES = 5 # Sweet spot for consensus (85-90% benefit)
|
||||
MEMORY_OVERHEAD_FACTOR = 0.95 # Leave 5% buffer
|
||||
# Load configuration from JSON
|
||||
_config_path = Path(__file__).parent.parent.parent / "config" / "models" / "selector_config.json"
|
||||
_config = {}
|
||||
if _config_path.exists():
|
||||
with open(_config_path, 'r') as f:
|
||||
_config = json.load(f)
|
||||
|
||||
# Apple Silicon MLX constraints - MLX uses GPU efficiently with 1 worker
|
||||
MLX_MAX_INSTANCES = 1 # MLX handles all GPU resources in single instance
|
||||
# Extract constraints
|
||||
_constraints = _config.get("constraints", {})
|
||||
MIN_INSTANCES = _constraints.get("min_instances", 1)
|
||||
MAX_INSTANCES = _constraints.get("max_instances", 8)
|
||||
OPTIMAL_MAX_INSTANCES = _constraints.get("optimal_max_instances", 5)
|
||||
MEMORY_OVERHEAD_FACTOR = _constraints.get("memory_overhead_factor", 0.95)
|
||||
MLX_MAX_INSTANCES = _constraints.get("mlx_max_instances", 1)
|
||||
|
||||
|
||||
# Context window options
|
||||
CONTEXT_OPTIONS = {
|
||||
16384: "16K tokens",
|
||||
32768: "32K tokens (default)",
|
||||
65536: "64K tokens",
|
||||
131072: "128K tokens"
|
||||
}
|
||||
|
||||
# Offloading options
|
||||
OFFLOAD_OPTIONS = {
|
||||
0.0: "No offload (default) - 100% GPU",
|
||||
0.2: "20% offload - 80% GPU, 20% RAM",
|
||||
0.5: "50% offload - 50% GPU, 50% RAM"
|
||||
}
|
||||
|
||||
|
||||
def calculate_context_memory(context_size: int, quantization_bits: int = 4) -> float:
|
||||
"""
|
||||
Calculate additional memory needed for KV cache based on context size.
|
||||
|
||||
Args:
|
||||
context_size: Number of tokens in context window
|
||||
quantization_bits: Quantization bits (4 for Q4, 5 for Q5, etc.)
|
||||
|
||||
Returns:
|
||||
Additional VRAM needed in GB
|
||||
"""
|
||||
# KV cache memory per token: 2 * num_layers * hidden_dim * bytes_per_param
|
||||
# Rough estimate: ~0.5MB per 1K tokens for 4-bit quantization
|
||||
bytes_per_token = (quantization_bits / 8) * 0.5 # 0.5 MB base per token at fp16
|
||||
memory_mb = (context_size / 1000) * bytes_per_token * 1000
|
||||
return memory_mb / 1024 # Convert to GB
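A quick hand evaluation of this formula for a 32K-token context at 4-bit shows that the factor stored in `bytes_per_token` is effectively megabytes per token, so the result (about 8 GB) is far larger than the "~0.5MB per 1K tokens" the comment suggests; the intended units are worth double-checking.

```python
# Evaluating calculate_context_memory by hand for a 32K context at 4-bit.
context_size = 32768
quantization_bits = 4

per_token = (quantization_bits / 8) * 0.5                # 0.25, effectively MB per token
memory_mb = (context_size / 1000) * per_token * 1000     # 8192.0 MB
memory_gb = memory_mb / 1024                              # 8.0 GB
print(f"{memory_gb:.1f} GB")
```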
def calculate_memory_with_offload(
|
||||
base_vram_gb: float,
|
||||
context_size: int,
|
||||
offload_percent: float,
|
||||
quantization_bits: int = 4
|
||||
) -> tuple[float, float]:
|
||||
"""
|
||||
Calculate VRAM and RAM usage with offloading.
|
||||
|
||||
Args:
|
||||
base_vram_gb: Base model VRAM without context
|
||||
context_size: Context window size in tokens
|
||||
offload_percent: Percentage of model offloaded to RAM (0.0-1.0)
|
||||
quantization_bits: Quantization precision
|
||||
|
||||
Returns:
|
||||
(vram_usage_gb, ram_usage_gb)
|
||||
"""
|
||||
# Context memory (KV cache) - always in VRAM for speed
|
||||
context_memory = calculate_context_memory(context_size, quantization_bits)
|
||||
|
||||
# Model weights split between GPU and RAM
|
||||
gpu_model_memory = base_vram_gb * (1 - offload_percent)
|
||||
ram_model_memory = base_vram_gb * offload_percent
|
||||
|
||||
# Context cache stays in VRAM for performance
|
||||
vram_total = gpu_model_memory + context_memory
|
||||
ram_total = ram_model_memory
|
||||
|
||||
return vram_total, ram_total
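For example, offloading 20% of a 9 GB model while keeping a 1 GB KV cache on the GPU splits memory like this (illustrative numbers; in this change the helper itself moves to models.memory_calculator, per the import at the top of the file):

```python
# Sketch: 20% offload of a 9 GB model, KV cache kept in VRAM (assumed values).
base_vram_gb = 9.0        # model weights
offload_percent = 0.2
context_memory = 1.0      # KV-cache estimate in GB

gpu_model_memory = base_vram_gb * (1 - offload_percent)   # 7.2 GB of weights on GPU
ram_model_memory = base_vram_gb * offload_percent          # 1.8 GB of weights in RAM
vram_total = gpu_model_memory + context_memory             # 8.2 GB VRAM per instance
ram_total = ram_model_memory                                # 1.8 GB RAM per instance
print(vram_total, ram_total)
```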
def get_available_memory_with_offload(
|
||||
hardware: HardwareProfile,
|
||||
offload_percent: float
|
||||
) -> tuple[float, float]:
|
||||
"""
|
||||
Get available GPU VRAM and system RAM considering offloading.
|
||||
|
||||
Args:
|
||||
hardware: Hardware profile
|
||||
offload_percent: Offloading percentage
|
||||
|
||||
Returns:
|
||||
(available_vram_gb, available_ram_gb)
|
||||
"""
|
||||
if hardware.gpu and not hardware.is_apple_silicon:
|
||||
# External GPU - use GPU VRAM + potentially some system RAM
|
||||
available_vram = hardware.gpu.vram_gb * 0.9 # 10% buffer
|
||||
available_ram = hardware.ram_gb * 0.5 * offload_percent # Portion of RAM for offload
|
||||
elif hardware.is_apple_silicon:
|
||||
# Apple Silicon - unified memory
|
||||
# Use full available memory (RAM - 4GB), not just 50%
|
||||
available_total = hardware.available_memory_gb
|
||||
available_vram = available_total * (1 - offload_percent)
|
||||
available_ram = available_total * offload_percent
|
||||
else:
|
||||
# CPU only - use full available memory (RAM - 4GB safety)
|
||||
# On CPU-only, there's no VRAM/RAM split, just system RAM
|
||||
available_vram = hardware.available_memory_gb # Use the new limit
|
||||
available_ram = 0
|
||||
|
||||
return available_vram, available_ram
|
||||
|
||||
|
||||
def calculate_max_instances(available_memory_gb: float, memory_per_instance: float, optimal: bool = True) -> int:
|
||||
"""
|
||||
Calculate number of instances based on available memory.
|
||||
|
||||
Args:
|
||||
available_memory_gb: Available memory in GB
|
||||
memory_per_instance: Memory required per instance in GB
|
||||
optimal: If True, cap at OPTIMAL_MAX_INSTANCES (3-5 sweet spot).
|
||||
If False, return maximum possible (up to MAX_INSTANCES).
|
||||
|
||||
Returns:
|
||||
Recommended number of instances (2-5 for optimal, 2-8 for max)
|
||||
"""
|
||||
effective_memory = available_memory_gb * MEMORY_OVERHEAD_FACTOR
|
||||
max_possible = int(effective_memory // memory_per_instance)
|
||||
|
||||
if optimal:
|
||||
# Use optimal range: 2-5 instances (research-backed sweet spot)
|
||||
# 3-5 instances gives 85-90% of consensus benefit
|
||||
# More than 5 has diminishing returns
|
||||
if max_possible >= OPTIMAL_MAX_INSTANCES:
|
||||
return OPTIMAL_MAX_INSTANCES # Cap at sweet spot
|
||||
elif max_possible >= 3:
|
||||
return max_possible # Use 3-4 if memory allows
|
||||
else:
|
||||
# Only return MIN_INSTANCES if memory actually fits that many
|
||||
if max_possible >= MIN_INSTANCES:
|
||||
return max_possible # Return what actually fits, not MIN_INSTANCES
|
||||
return max(max_possible, 1)
|
||||
else:
|
||||
# Return absolute maximum (for users who explicitly want more)
|
||||
return max(MIN_INSTANCES, min(max_possible, MAX_INSTANCES))
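With 48 GB of usable memory and roughly 7.5 GB per instance, the two modes give different counts. This is a sketch using the default constants; the function is now imported from models.memory_calculator in this change.

```python
# Sketch: instance counts for 48 GB available and ~7.5 GB per instance.
available_memory_gb = 48.0
memory_per_instance = 7.5

effective = available_memory_gb * 0.95                    # MEMORY_OVERHEAD_FACTOR
max_possible = int(effective // memory_per_instance)      # 6

optimal = min(max_possible, 5)           # optimal=True caps at OPTIMAL_MAX_INSTANCES -> 5
absolute = max(1, min(max_possible, 8))  # optimal=False bounds by MIN/MAX_INSTANCES -> 6
print(optimal, absolute)
```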
# Context and offload options
|
||||
CONTEXT_OPTIONS = {int(k): v for k, v in _config.get("context_options", {}).items()}
|
||||
OFFLOAD_OPTIONS = {float(k): v for k, v in _config.get("offload_options", {}).items()}
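The constants above are now read from config/models/selector_config.json. A file shaped like the following would reproduce the previous hard-coded defaults; the key names come from the code, but the exact contents are an assumption rather than the project's actual file.

```python
# Hypothetical selector_config.json content matching the keys read above.
# Values mirror the former hard-coded defaults; this is an assumed example.
import json

config = {
    "constraints": {
        "min_instances": 1,
        "max_instances": 8,
        "optimal_max_instances": 5,
        "memory_overhead_factor": 0.95,
        "mlx_max_instances": 1,
    },
    "context_options": {
        "16384": "16K tokens",
        "32768": "32K tokens (default)",
        "65536": "64K tokens",
        "131072": "128K tokens",
    },
    "offload_options": {
        "0.0": "No offload (default) - 100% GPU",
        "0.2": "20% offload - 80% GPU, 20% RAM",
        "0.5": "50% offload - 50% GPU, 50% RAM",
    },
}
print(json.dumps(config, indent=2))
```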
def select_optimal_model(
|
||||
@@ -191,125 +76,118 @@ def select_optimal_model(
|
||||
force_instances: Optional[int] = None,
|
||||
context_size: int = 32768,
|
||||
offload_percent: float = 0.0,
|
||||
use_mlx: bool = False
|
||||
use_mlx: Optional[bool] = None
|
||||
) -> Optional[ModelConfig]:
|
||||
"""
|
||||
Select the optimal model configuration for given hardware.
|
||||
"""Select the optimal model configuration for given hardware."""
|
||||
# Auto-detect MLX usage for Apple Silicon if not explicitly set
|
||||
if use_mlx is None:
|
||||
use_mlx = hardware.is_apple_silicon
|
||||
|
||||
Args:
|
||||
hardware: Hardware profile
|
||||
preferred_model: Optional model ID to force (e.g., "qwen2.5-coder")
|
||||
force_instances: Optional number of instances to force
|
||||
context_size: Context window size in tokens (default: 32768)
|
||||
offload_percent: Portion of model to offload to RAM (0.0-1.0)
|
||||
use_mlx: Whether to use MLX format models (Apple Silicon)
|
||||
available_vram, _ = get_available_memory_with_offload(hardware, offload_percent)
|
||||
|
||||
Returns:
|
||||
ModelConfig or None if no suitable model found
|
||||
"""
|
||||
# Auto-detect MLX if on Apple Silicon and not explicitly set
|
||||
if use_mlx is None and hardware.is_apple_silicon:
|
||||
use_mlx = True
|
||||
|
||||
# Get available memory considering offloading
|
||||
available_vram, available_ram = get_available_memory_with_offload(hardware, offload_percent)
|
||||
|
||||
# Get models to try (with appropriate quantizations)
|
||||
# Note: Don't check available quantizations here (too slow for menu rendering)
|
||||
# Only check when user is actually browsing or selecting custom config
|
||||
if preferred_model:
|
||||
from models.registry import get_model
|
||||
|
||||
# Parse preferred_model - can be:
|
||||
# - "qwen2.5-coder" (just model ID)
|
||||
# - "qwen2.5-coder:7b:4bit" (model:size:quant format)
|
||||
if ':' in preferred_model:
|
||||
# Full format: model_id:size:quant
|
||||
parts = preferred_model.split(':')
|
||||
if len(parts) >= 3:
|
||||
model_id = parts[0]
|
||||
preferred_size = parts[1]
|
||||
preferred_quant = parts[2]
|
||||
else:
|
||||
model_id = preferred_model
|
||||
preferred_size = None
|
||||
preferred_quant = None
|
||||
else:
|
||||
model_id = preferred_model
|
||||
preferred_size = None
|
||||
preferred_quant = None
|
||||
|
||||
preferred = get_model(model_id, use_mlx=use_mlx)
|
||||
if preferred and preferred_size and preferred_quant:
|
||||
# Try to find specific variant and quantization
|
||||
found_variant = None
|
||||
found_quant = None
|
||||
|
||||
for variant in preferred.variants:
|
||||
if variant.size.lower() == preferred_size.lower():
|
||||
found_variant = variant
|
||||
for quant in variant.quantizations:
|
||||
if quant.name.lower() == preferred_quant.lower():
|
||||
found_quant = quant
|
||||
break
|
||||
break
|
||||
|
||||
if found_variant and found_quant:
|
||||
# Found the specific variant/quant, check if it fits
|
||||
memory_needed = found_quant.vram_gb
|
||||
if memory_needed <= available_vram:
|
||||
# Calculate instances
|
||||
if force_instances:
|
||||
instances = force_instances
|
||||
else:
|
||||
instances = calculate_max_instances(available_vram, found_quant.vram_gb, optimal=True)
|
||||
# Cap at 1 for MLX
|
||||
if use_mlx:
|
||||
instances = min(instances, MLX_MAX_INSTANCES)
|
||||
return ModelConfig(
|
||||
model=preferred,
|
||||
variant=found_variant,
|
||||
quantization=found_quant,
|
||||
instances=instances,
|
||||
memory_per_instance_gb=found_quant.vram_gb,
|
||||
total_memory_gb=found_quant.vram_gb * instances,
|
||||
context_size=context_size,
|
||||
offload_percent=offload_percent,
|
||||
vram_usage_gb=found_quant.vram_gb,
|
||||
ram_usage_gb=0.0
|
||||
)
|
||||
else:
|
||||
# Specific config requested but doesn't fit - return None
|
||||
print(f"\n⚠️ Requested model {model_id}:{preferred_size}:{preferred_quant} requires {memory_needed:.1f}GB but only {available_vram:.1f}GB available")
|
||||
return None
|
||||
else:
|
||||
# Specific config requested but not found
|
||||
print(f"\n⚠️ Model configuration not found: {model_id}:{preferred_size}:{preferred_quant}")
|
||||
print(f" Available sizes for {model_id}: {[v.size for v in preferred.variants]}")
|
||||
return None
|
||||
|
||||
models = [preferred] if preferred else []
|
||||
else:
|
||||
models = list_models(use_mlx=use_mlx)
|
||||
config = _handle_preferred_model(
|
||||
preferred_model, hardware, available_vram, force_instances,
|
||||
context_size, offload_percent, use_mlx
|
||||
)
|
||||
if config:
|
||||
return config
|
||||
|
||||
# Note: On Apple Silicon with MLX, multiple instances work fine in sequential mode
|
||||
# The swarm manager will handle sequential execution to avoid GPU conflicts
|
||||
models = list_models(use_mlx=use_mlx)
|
||||
|
||||
# Try each model in priority order
|
||||
for model in models:
|
||||
config = _try_model_with_context(model, available_vram, force_instances, context_size, offload_percent, use_mlx)
|
||||
if config:
|
||||
return config
|
||||
|
||||
# If nothing fits, try smallest variant of first model
|
||||
if models:
|
||||
smallest_config = _try_smallest_variant_with_context(models[0], available_vram, force_instances, context_size, offload_percent, use_mlx)
|
||||
if smallest_config:
|
||||
return smallest_config
|
||||
return _try_smallest_variant_with_context(models[0], available_vram, force_instances, context_size, offload_percent, use_mlx)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _handle_preferred_model(
|
||||
preferred_model: str,
|
||||
hardware: HardwareProfile,
|
||||
available_vram: float,
|
||||
force_instances: Optional[int],
|
||||
context_size: int,
|
||||
offload_percent: float,
|
||||
use_mlx: bool
|
||||
) -> Optional[ModelConfig]:
|
||||
"""Handle preferred model selection."""
|
||||
from models.registry import get_model
|
||||
|
||||
model_id = preferred_model
|
||||
preferred_size = None
|
||||
preferred_quant = None
|
||||
|
||||
if ':' in preferred_model:
|
||||
parts = preferred_model.split(':')
|
||||
if len(parts) >= 3:
|
||||
model_id = parts[0]
|
||||
preferred_size = parts[1]
|
||||
preferred_quant = parts[2]
|
||||
|
||||
preferred = get_model(model_id, use_mlx=use_mlx)
|
||||
if not preferred:
|
||||
return None
|
||||
|
||||
if preferred_size and preferred_quant:
|
||||
return _try_specific_config(
|
||||
preferred, preferred_size, preferred_quant, available_vram,
|
||||
force_instances, context_size, offload_percent, use_mlx
|
||||
)
|
||||
|
||||
models = [preferred]
|
||||
for model in models:
|
||||
config = _try_model_with_context(model, available_vram, force_instances, context_size, offload_percent, use_mlx)
|
||||
if config:
|
||||
return config
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _try_specific_config(
|
||||
model: Model,
|
||||
preferred_size: str,
|
||||
preferred_quant: str,
|
||||
available_vram: float,
|
||||
force_instances: Optional[int],
|
||||
context_size: int,
|
||||
offload_percent: float,
|
||||
use_mlx: bool
|
||||
) -> Optional[ModelConfig]:
|
||||
"""Try to use a specific model configuration."""
|
||||
for variant in model.variants:
|
||||
if variant.size.lower() == preferred_size.lower():
|
||||
for quant in variant.quantizations:
|
||||
if quant.name.lower() == preferred_quant.lower():
|
||||
if quant.vram_gb <= available_vram:
|
||||
instances = force_instances or calculate_max_instances(available_vram, quant.vram_gb, optimal=True)
|
||||
if use_mlx:
|
||||
instances = min(instances, MLX_MAX_INSTANCES)
|
||||
|
||||
return ModelConfig(
|
||||
model=model,
|
||||
variant=variant,
|
||||
quantization=quant,
|
||||
instances=instances,
|
||||
memory_per_instance_gb=quant.vram_gb,
|
||||
total_memory_gb=quant.vram_gb * instances,
|
||||
context_size=context_size,
|
||||
offload_percent=offload_percent,
|
||||
vram_usage_gb=quant.vram_gb,
|
||||
ram_usage_gb=0.0
|
||||
)
|
||||
else:
|
||||
print(f"\n⚠️ Requested model requires {quant.vram_gb:.1f}GB but only {available_vram:.1f}GB available")
|
||||
return None
|
||||
|
||||
print(f"\n⚠️ Model configuration not found: {model.id}:{preferred_size}:{preferred_quant}")
|
||||
return None
|
||||
|
||||
|
||||
def _try_model_with_context(
|
||||
model: Model,
|
||||
available_vram: float,
|
||||
@@ -319,9 +197,7 @@ def _try_model_with_context(
|
||||
use_mlx: bool = False
|
||||
) -> Optional[ModelConfig]:
|
||||
"""Try to fit a model in available memory with context and offloading."""
|
||||
# Try variants from largest to smallest
|
||||
for variant in sorted(model.variants, key=lambda v: v.base_vram_gb, reverse=True):
|
||||
# Try quantizations from best to fastest
|
||||
sorted_quants = sorted(
|
||||
variant.quantizations,
|
||||
key=lambda q: (['fast', 'good', 'better', 'best'].index(q.quality), -q.vram_gb),
|
||||
@@ -329,10 +205,7 @@ def _try_model_with_context(
|
||||
)
|
||||
|
||||
for quant in sorted_quants:
|
||||
# Calculate memory with context and offloading
|
||||
# Extract quantization bits from name (e.g., "4bit" -> 4, "q4_k_m" -> 4)
|
||||
if 'bit' in quant.name:
|
||||
# MLX format: "4bit", "3bit", etc.
|
||||
quantization_bits = int(quant.name.replace('bit', ''))
|
||||
elif 'q4' in quant.name:
|
||||
quantization_bits = 4
|
||||
@@ -341,37 +214,23 @@ def _try_model_with_context(
|
||||
elif 'q6' in quant.name:
|
||||
quantization_bits = 6
|
||||
else:
|
||||
quantization_bits = 4 # Default fallback
|
||||
quantization_bits = 4
|
||||
|
||||
vram_per_instance, ram_per_instance = calculate_memory_with_offload(
|
||||
quant.vram_gb, context_size, offload_percent, quantization_bits
|
||||
)
|
||||
|
||||
# Check if at least MIN_INSTANCES can fit
|
||||
min_needed = vram_per_instance * MIN_INSTANCES
|
||||
if min_needed > available_vram:
|
||||
if vram_per_instance * MIN_INSTANCES > available_vram:
|
||||
continue
|
||||
|
||||
# Calculate instances
|
||||
if force_instances:
|
||||
instances = force_instances
|
||||
if not use_mlx: # On non-Mac, check if all instances fit in VRAM
|
||||
total_needed = vram_per_instance * instances
|
||||
if total_needed > available_vram:
|
||||
continue
|
||||
if not use_mlx and vram_per_instance * instances > available_vram:
|
||||
continue
|
||||
else:
|
||||
# On Mac with MLX (use_mlx=True), use 3 responses by default
|
||||
# On other platforms, calculate based on VRAM
|
||||
if use_mlx:
|
||||
instances = 1 # DEBUG: Changed from 3 to 1 for faster testing
|
||||
else:
|
||||
instances = calculate_max_instances(available_vram, vram_per_instance)
|
||||
instances = 1 if use_mlx else calculate_max_instances(available_vram, vram_per_instance)
|
||||
|
||||
# On Mac with seed variation, memory doesn't multiply
|
||||
if use_mlx:
|
||||
total_memory = vram_per_instance + ram_per_instance
|
||||
else:
|
||||
total_memory = (vram_per_instance + ram_per_instance) * instances
|
||||
total_memory = vram_per_instance + ram_per_instance if use_mlx else (vram_per_instance + ram_per_instance) * instances
|
||||
|
||||
return ModelConfig(
|
||||
model=model,
|
||||
@@ -401,38 +260,25 @@ def _try_smallest_variant_with_context(
|
||||
if not model.variants:
|
||||
return None
|
||||
|
||||
# Get smallest variant
|
||||
smallest_variant = min(model.variants, key=lambda v: v.base_vram_gb)
|
||||
|
||||
# Get smallest quantization
|
||||
if not smallest_variant.quantizations:
|
||||
return None
|
||||
|
||||
smallest_quant = min(smallest_variant.quantizations, key=lambda q: q.vram_gb)
|
||||
|
||||
# Calculate memory with context and offloading
|
||||
quantization_bits = 4 if 'q4' in smallest_quant.name else (5 if 'q5' in smallest_quant.name else 6)
|
||||
vram_per_instance, ram_per_instance = calculate_memory_with_offload(
|
||||
smallest_quant.vram_gb, context_size, offload_percent, quantization_bits
|
||||
)
|
||||
|
||||
# Check if even this fits
|
||||
if vram_per_instance > available_vram:
|
||||
return None
|
||||
|
||||
# On Mac with MLX, use 3 responses by default
|
||||
if use_mlx:
|
||||
instances = force_instances or 1 # DEBUG: Changed from 3 to 1
|
||||
else:
|
||||
instances = force_instances or calculate_max_instances(available_vram, vram_per_instance)
|
||||
instances = force_instances or (1 if use_mlx else calculate_max_instances(available_vram, vram_per_instance))
|
||||
instances = max(instances, 1)
|
||||
|
||||
# On Mac with seed variation, memory doesn't multiply
|
||||
if use_mlx:
|
||||
total_memory = vram_per_instance + ram_per_instance
|
||||
else:
|
||||
total_memory = (vram_per_instance + ram_per_instance) * instances
|
||||
|
||||
total_memory = vram_per_instance + ram_per_instance if use_mlx else (vram_per_instance + ram_per_instance) * instances
|
||||
|
||||
return ModelConfig(
|
||||
model=model,
|
||||
variant=smallest_variant,
|
||||
@@ -456,7 +302,6 @@ def format_recommendation(config: ModelConfig, hardware: HardwareProfile) -> str
|
||||
f"GPU VRAM per instance: {config.vram_usage_gb:.1f} GB",
|
||||
]
|
||||
|
||||
# Show RAM usage if offloading
|
||||
if config.offload_percent > 0:
|
||||
lines.append(f"System RAM per instance: {config.ram_usage_gb:.1f} GB")
|
||||
|
||||
@@ -464,8 +309,7 @@ def format_recommendation(config: ModelConfig, hardware: HardwareProfile) -> str
|
||||
f"Total memory used: {config.total_memory_gb:.1f} GB",
|
||||
f"Available memory: {hardware.available_memory_gb:.1f} GB",
|
||||
])
|
||||
|
||||
# Add instance count explanation
|
||||
|
||||
if config.instances == OPTIMAL_MAX_INSTANCES:
|
||||
lines.append(f"Note: Using optimal instance count (3-5 = 85-90% consensus benefit)")
|
||||
elif config.instances == MIN_INSTANCES:
|
||||
|
||||
@@ -6,10 +6,43 @@ Uses mDNS/Bonjour to discover other Local Swarm instances on the local network.
|
||||
import socket
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
||||
@dataclass
|
||||
class PeerMetrics:
|
||||
"""Metrics for tracking peer performance."""
|
||||
total_requests: int = 0
|
||||
successful_requests: int = 0
|
||||
failed_requests: int = 0
|
||||
total_latency_ms: float = 0.0
|
||||
avg_latency_ms: float = 0.0
|
||||
last_error: Optional[str] = None
|
||||
last_error_time: Optional[datetime] = None
|
||||
|
||||
@property
|
||||
def success_rate(self) -> float:
|
||||
"""Calculate success rate (0.0 to 1.0)."""
|
||||
if self.total_requests == 0:
|
||||
return 1.0
|
||||
return self.successful_requests / self.total_requests
|
||||
|
||||
def record_success(self, latency_ms: float):
|
||||
"""Record a successful request."""
|
||||
self.total_requests += 1
|
||||
self.successful_requests += 1
|
||||
self.total_latency_ms += latency_ms
|
||||
self.avg_latency_ms = self.total_latency_ms / self.successful_requests
|
||||
|
||||
def record_failure(self, error: str):
|
||||
"""Record a failed request."""
|
||||
self.total_requests += 1
|
||||
self.failed_requests += 1
|
||||
self.last_error = error
|
||||
self.last_error_time = datetime.now()
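Assuming the class as defined above, a peer's metrics accumulate like this:

```python
# Sketch: how PeerMetrics accumulates per-peer statistics.
metrics = PeerMetrics()
metrics.record_success(latency_ms=120.0)
metrics.record_success(latency_ms=80.0)
metrics.record_failure("Timeout (60.0s)")

print(metrics.total_requests)          # 3
print(round(metrics.success_rate, 2))  # 0.67
print(metrics.avg_latency_ms)          # 100.0
```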
@dataclass
|
||||
class PeerInfo:
|
||||
"""Information about a peer swarm."""
|
||||
@@ -21,6 +54,8 @@ class PeerInfo:
|
||||
model_id: str
|
||||
hardware_summary: str
|
||||
last_seen: datetime
|
||||
timeout_seconds: float = 60.0 # Configurable timeout per peer
|
||||
metrics: PeerMetrics = field(default_factory=PeerMetrics)
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
@@ -100,6 +135,8 @@ class SwarmDiscovery:
|
||||
await asyncio.to_thread(self._zeroconf.register_service, self._info)
|
||||
print(f" ✓ Advertising on mDNS: {service_name}")
|
||||
print(f" IP: {ip}:{self.listen_port}")
|
||||
print(f" Service type: {self.SERVICE_TYPE}")
|
||||
print(f" Properties: instances={swarm_info.get('instances', 0)}, model={swarm_info.get('model_id', 'unknown')}")
|
||||
|
||||
except ImportError:
|
||||
print(" ⚠️ zeroconf not installed, skipping mDNS advertising")
|
||||
@@ -117,6 +154,10 @@ class SwarmDiscovery:
|
||||
self._async_zeroconf = AsyncZeroconf()
|
||||
self._zeroconf = self._async_zeroconf.zeroconf
|
||||
|
||||
# Store event loop reference for callbacks
|
||||
self._loop = asyncio.get_event_loop()
|
||||
print(f" Event loop: {self._loop}")
|
||||
|
||||
# Create async browser (passes the underlying Zeroconf instance)
|
||||
self._browser = AsyncServiceBrowser(
|
||||
self._zeroconf,
|
||||
@@ -125,6 +166,7 @@ class SwarmDiscovery:
|
||||
)
|
||||
|
||||
print(f" ✓ Listening for peers on {self.SERVICE_TYPE}")
|
||||
print(f" Will discover peers advertising on same network")
|
||||
self._running = True
|
||||
|
||||
except ImportError:
|
||||
@@ -136,16 +178,23 @@ class SwarmDiscovery:
|
||||
"""Handle mDNS service state changes (called from zeroconf background thread)."""
|
||||
from zeroconf import ServiceStateChange
|
||||
|
||||
print(f" [mDNS] Service state change: {name} -> {state_change.name}")
|
||||
|
||||
if state_change == ServiceStateChange.Added:
|
||||
print(f" [mDNS] Service added: {name}")
|
||||
# Schedule coroutine on the event loop from this background thread
|
||||
if self._loop is not None and self._loop.is_running():
|
||||
print(f" [mDNS] Scheduling peer addition...")
|
||||
asyncio.run_coroutine_threadsafe(
|
||||
self._add_peer(zeroconf, service_type, name),
|
||||
self._loop
|
||||
)
|
||||
else:
|
||||
print(f" [mDNS] Warning: Event loop not available")
|
||||
elif state_change == ServiceStateChange.Removed:
|
||||
# Service removed
|
||||
peer_key = name.replace(f".{self.SERVICE_TYPE}", "")
|
||||
print(f" [mDNS] Service removed: {peer_key}")
|
||||
if peer_key in self.peers:
|
||||
del self.peers[peer_key]
|
||||
print(f" 👋 Peer left: {peer_key}")
|
||||
@@ -292,13 +341,13 @@ class SwarmDiscovery:
|
||||
finally:
|
||||
s.close()
|
||||
|
||||
# Verify it's the correct private IP (192.168.x.x only for this network)
|
||||
is_private = ip.startswith('192.168.')
|
||||
|
||||
if is_private:
|
||||
# Only bind to 192.168.x.x as requested
|
||||
if ip.startswith('192.168.'):
|
||||
print(f" ✓ Using IP: {ip}")
|
||||
return ip
|
||||
else:
|
||||
print(f" ⚠️ IP {ip} is not private, using localhost")
|
||||
print(f" ⚠️ IP {ip} is not 192.168.x.x, using localhost")
|
||||
print(f" Federation requires 192.168.x.x network")
|
||||
return '127.0.0.1'
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Error detecting IP: {e}")
|
||||
|
||||
+196 -95
@@ -5,7 +5,7 @@ Handles communication between peer swarms for distributed consensus.
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import List, Optional, Dict, Any
|
||||
from typing import List, Optional, Dict, Any, Tuple
|
||||
from dataclasses import dataclass
|
||||
|
||||
from network.discovery import PeerInfo
|
||||
@@ -20,6 +20,8 @@ class PeerVote:
|
||||
confidence: float
|
||||
latency_ms: float
|
||||
worker_count: int
|
||||
tokens_per_second: float = 0.0
|
||||
tokens_generated: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -29,12 +31,14 @@ class FederationResult:
|
||||
local_confidence: float
|
||||
peer_votes: List[PeerVote]
|
||||
strategy: str
|
||||
winner: str = "" # Name of the winning node ("local" or peer name)
|
||||
global_tokens_per_second: float = 0.0 # Includes sync + voting overhead
|
||||
|
||||
|
||||
class FederationClient:
|
||||
"""Client for communicating with peer swarms."""
|
||||
|
||||
def __init__(self, timeout: float = 30.0):
|
||||
def __init__(self, timeout: float = 60.0):
|
||||
"""
|
||||
Initialize federation client.
|
||||
|
||||
@@ -79,42 +83,58 @@ class FederationClient:
|
||||
Returns:
|
||||
PeerVote or None if request failed
|
||||
"""
|
||||
request_start = time.time()
|
||||
# Use peer-specific timeout if available, otherwise use default
|
||||
timeout = getattr(peer, 'timeout_seconds', self.timeout)
|
||||
|
||||
try:
|
||||
import aiohttp
|
||||
|
||||
session = await self._get_session()
|
||||
# Create session with peer-specific timeout
|
||||
session_timeout = aiohttp.ClientTimeout(total=timeout)
|
||||
async with aiohttp.ClientSession(timeout=session_timeout) as session:
|
||||
url = f"{peer.api_url}/v1/federation/vote"
|
||||
payload = {
|
||||
"prompt": prompt,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": temperature,
|
||||
"request_id": f"fed_{time.time()}"
|
||||
}
|
||||
|
||||
url = f"{peer.api_url}/v1/federation/vote"
|
||||
payload = {
|
||||
"prompt": prompt,
|
||||
"max_tokens": max_tokens,
|
||||
"temperature": temperature,
|
||||
"request_id": f"fed_{time.time()}"
|
||||
}
|
||||
print(f" → Sending request to {url} (timeout: {timeout}s)")
|
||||
async with session.post(url, json=payload) as resp:
|
||||
print(f" ← Got response {resp.status} from {peer.name}")
|
||||
if resp.status != 200:
|
||||
print(f" ✗ Peer {peer.name} returned status {resp.status}")
|
||||
peer.metrics.record_failure(f"HTTP {resp.status}")
|
||||
return None
|
||||
|
||||
print(f" → Sending request to {url}")
|
||||
async with session.post(url, json=payload) as resp:
|
||||
print(f" ← Got response {resp.status} from {peer.name}")
|
||||
if resp.status != 200:
|
||||
print(f" ✗ Peer {peer.name} returned status {resp.status}")
|
||||
return None
|
||||
data = await resp.json()
|
||||
latency_ms = (time.time() - request_start) * 1000
|
||||
print(f" ✓ Peer {peer.name} responded successfully ({latency_ms:.0f}ms)")
|
||||
|
||||
# Record success metrics
|
||||
peer.metrics.record_success(latency_ms)
|
||||
|
||||
data = await resp.json()
|
||||
print(f" ✓ Peer {peer.name} responded successfully")
|
||||
|
||||
return PeerVote(
|
||||
peer_name=peer.name,
|
||||
response_text=data.get("response", ""),
|
||||
confidence=data.get("confidence", 0.5),
|
||||
latency_ms=data.get("latency_ms", 0),
|
||||
worker_count=data.get("worker_count", 0)
|
||||
)
|
||||
return PeerVote(
|
||||
peer_name=peer.name,
|
||||
response_text=data.get("response", ""),
|
||||
confidence=data.get("confidence", 0.5),
|
||||
latency_ms=data.get("latency_ms", latency_ms),
|
||||
worker_count=data.get("worker_count", 0),
|
||||
tokens_per_second=data.get("tokens_per_second", 0.0),
|
||||
tokens_generated=data.get("tokens_generated", 0)
|
||||
)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
print(f" ⚠️ Peer {peer.name} timed out (>{self.timeout}s)")
|
||||
error_msg = f"Timeout ({timeout}s)"
|
||||
print(f" ⚠️ Peer {peer.name} {error_msg}")
|
||||
peer.metrics.record_failure(error_msg)
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Error contacting peer {peer.name}: {e}")
|
||||
error_msg = str(e)
|
||||
print(f" ⚠️ Error contacting peer {peer.name}: {error_msg}")
|
||||
peer.metrics.record_failure(error_msg)
|
||||
return None
|
||||
|
||||
async def health_check(self, peer: PeerInfo) -> bool:
|
||||
@@ -172,6 +192,8 @@ class FederatedSwarm:
|
||||
) -> FederationResult:
|
||||
"""
|
||||
Generate with federation across peer swarms.
|
||||
|
||||
Optimized: Runs local and peer generation in parallel for maximum speed.
|
||||
|
||||
Args:
|
||||
prompt: Input prompt
|
||||
@@ -182,74 +204,131 @@ class FederatedSwarm:
|
||||
Returns:
|
||||
FederationResult with final response
|
||||
"""
|
||||
# Phase 1: Local generation and consensus
|
||||
print(f" 🏠 Local swarm generating...")
|
||||
local_result = await self.local_swarm.generate(
|
||||
peers = self.discovery.get_peers()
|
||||
|
||||
if len(peers) == 0:
|
||||
if min_peers > 0:
|
||||
raise RuntimeError(f"Federation requires {min_peers} peers, but none found")
|
||||
|
||||
# Solo mode - just run local generation
|
||||
print(f" 🏠 Solo mode - local swarm generating...")
|
||||
solo_start_time = time.time()
|
||||
local_result = await self.local_swarm.generate(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
use_consensus=True
|
||||
)
|
||||
solo_end_time = time.time()
|
||||
total_elapsed = solo_end_time - solo_start_time
|
||||
tokens_generated = local_result.selected_response.tokens_generated
|
||||
global_tps = tokens_generated / total_elapsed if total_elapsed > 0 else 0.0
|
||||
|
||||
print(f"\n 📊 Global Performance:")
|
||||
print(f" Total tokens: {tokens_generated}")
|
||||
print(f" Total time: {total_elapsed:.2f}s")
|
||||
print(f" Global speed: {global_tps:.1f} t/s")
|
||||
|
||||
return FederationResult(
|
||||
final_response=local_result.selected_response.text,
|
||||
local_confidence=local_result.confidence,
|
||||
peer_votes=[],
|
||||
strategy="solo",
|
||||
global_tokens_per_second=global_tps
|
||||
)
|
||||
|
||||
# Parallel generation: Local swarm AND peers generate simultaneously
|
||||
print(f" 🏠 Local swarm AND {len(peers)} peer(s) generating in parallel...")
|
||||
|
||||
# Track timing for global t/sec calculation (includes sync + voting overhead)
|
||||
federation_start_time = time.time()
|
||||
total_tokens_generated = 0
|
||||
|
||||
# Start local generation
|
||||
local_task = self.local_swarm.generate(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=temperature,
|
||||
use_consensus=True
|
||||
)
|
||||
|
||||
local_best = local_result.selected_response
|
||||
local_confidence = local_result.confidence
|
||||
|
||||
print(f" ✓ Local best (confidence: {local_confidence:.2f})")
|
||||
|
||||
# Phase 2: Collect peer votes
|
||||
peers = self.discovery.get_peers()
|
||||
|
||||
if len(peers) == 0:
|
||||
if min_peers > 0:
|
||||
raise RuntimeError(f"Federation requires {min_peers} peers, but none found")
|
||||
|
||||
# Solo mode - just return local result
|
||||
return FederationResult(
|
||||
final_response=local_best.text,
|
||||
local_confidence=local_confidence,
|
||||
peer_votes=[],
|
||||
strategy="solo"
|
||||
)
|
||||
|
||||
print(f" 🌐 Requesting votes from {len(peers)} peer(s)...")
|
||||
for peer in peers:
|
||||
print(f" → Contacting {peer.name} at {peer.api_url}")
|
||||
|
||||
peer_votes = []
|
||||
|
||||
# Start peer requests
|
||||
vote_tasks = [
|
||||
self.federation_client.request_vote(peer, prompt, max_tokens, temperature)
|
||||
for peer in peers
|
||||
]
|
||||
|
||||
results = await asyncio.gather(*vote_tasks, return_exceptions=True)
|
||||
|
||||
for peer, result in zip(peers, results):
|
||||
|
||||
# Run everything in parallel
|
||||
all_tasks = [local_task] + vote_tasks
|
||||
results = await asyncio.gather(*all_tasks, return_exceptions=True)
|
||||
|
||||
# Separate local result from peer votes
|
||||
local_result_raw = results[0]
|
||||
if isinstance(local_result_raw, Exception):
|
||||
print(f" ✗ Local swarm failed: {local_result_raw}")
|
||||
raise RuntimeError(f"Local generation failed: {local_result_raw}")
|
||||
|
||||
from swarm.manager import ConsensusResult
|
||||
local_result: ConsensusResult = local_result_raw # Now guaranteed not to be an exception
|
||||
local_best = local_result.selected_response
|
||||
local_confidence = local_result.confidence
|
||||
local_tps = local_best.tokens_per_second
|
||||
total_tokens_generated += local_best.tokens_generated
|
||||
print(f" ✓ Local completed (confidence: {local_confidence:.2f}, {local_tps:.1f} t/s)")
|
||||
|
||||
# Collect peer votes
|
||||
peer_votes = []
|
||||
for peer, result in zip(peers, results[1:]):
|
||||
if isinstance(result, Exception):
|
||||
print(f" ✗ Peer {peer.name} failed: {result}")
|
||||
elif result is not None:
|
||||
peer_votes.append(result)
|
||||
print(f" ✓ Peer {peer.name} voted (confidence: {result.confidence:.2f})")
|
||||
total_tokens_generated += result.tokens_generated if hasattr(result, 'tokens_generated') else 0
|
||||
print(f" ✓ Peer {peer.name} completed (confidence: {result.confidence:.2f}, {result.tokens_per_second:.1f} t/s)")
|
||||
|
||||
if len(peer_votes) == 0:
|
||||
# No peers responded, use local result
|
||||
print(" ⚠️ No peers responded, using local result")
|
||||
|
||||
# Calculate global t/sec even in fallback mode
|
||||
federation_end_time = time.time()
|
||||
total_elapsed_seconds = federation_end_time - federation_start_time
|
||||
global_tps = total_tokens_generated / total_elapsed_seconds if total_elapsed_seconds > 0 else 0.0
|
||||
|
||||
print(f"\n 📊 Global Performance:")
|
||||
print(f" Total tokens: {total_tokens_generated}")
|
||||
print(f" Total time: {total_elapsed_seconds:.2f}s")
|
||||
print(f" Global speed: {global_tps:.1f} t/s")
|
||||
|
||||
return FederationResult(
|
||||
final_response=local_best.text,
|
||||
local_confidence=local_confidence,
|
||||
peer_votes=[],
|
||||
strategy="local_fallback"
|
||||
strategy="local_fallback",
|
||||
global_tokens_per_second=global_tps
|
||||
)
|
||||
|
||||
# Phase 3: Global consensus
|
||||
# Global consensus
|
||||
print(f" 🗳️ Running global consensus ({len(peer_votes) + 1} votes)...")
|
||||
final_response, winner = self._weighted_vote(local_best.text, local_confidence, peer_votes)
|
||||
|
||||
final_response = self._weighted_vote(local_best.text, local_confidence, peer_votes)
|
||||
# Calculate global tokens/sec including sync + voting overhead
|
||||
federation_end_time = time.time()
|
||||
total_elapsed_seconds = federation_end_time - federation_start_time
|
||||
global_tps = total_tokens_generated / total_elapsed_seconds if total_elapsed_seconds > 0 else 0.0
|
||||
|
||||
print(f"\n 📊 Global Performance:")
|
||||
print(f" Total tokens: {total_tokens_generated}")
|
||||
print(f" Total time: {total_elapsed_seconds:.2f}s")
|
||||
print(f" Global speed: {global_tps:.1f} t/s (includes sync + voting)")
|
||||
|
||||
return FederationResult(
|
||||
final_response=final_response,
|
||||
local_confidence=local_confidence,
|
||||
peer_votes=peer_votes,
|
||||
strategy=self.consensus_strategy
|
||||
strategy=self.consensus_strategy,
|
||||
winner=winner,
|
||||
global_tokens_per_second=global_tps
|
||||
)
|
||||
|
||||
def _weighted_vote(
|
||||
@@ -257,11 +336,14 @@ class FederatedSwarm:
|
||||
local_response: str,
|
||||
local_confidence: float,
|
||||
peer_votes: List[PeerVote]
|
||||
) -> str:
|
||||
) -> Tuple[str, str]:
|
||||
"""
|
||||
Select best response using weighted voting.
|
||||
|
||||
Weights by confidence score. Higher confidence = more weight.
|
||||
|
||||
Returns:
|
||||
Tuple of (selected_response, winner_name)
|
||||
"""
|
||||
# Collect all votes with their weights
|
||||
all_votes = [(local_response, local_confidence, "local")]
|
||||
@@ -269,38 +351,39 @@ class FederatedSwarm:
|
||||
for vote in peer_votes:
|
||||
all_votes.append((vote.response_text, vote.confidence, vote.peer_name))
|
||||
|
||||
if self.consensus_strategy == "best_of_n":
|
||||
# Use the consensus engine to pick the best response
|
||||
from swarm.consensus import ConsensusEngine
|
||||
# Always use quality-based selection - the head node judges ALL responses
|
||||
# This prevents nodes from being overconfident about their own mediocre answers
|
||||
from swarm.consensus import ConsensusEngine, GenerationResponse
|
||||
|
||||
responses = [
|
||||
GenerationResponse(
|
||||
text=text,
|
||||
tokens_generated=0,
|
||||
tokens_per_second=0,
|
||||
latency_ms=0,
|
||||
backend_name=source
|
||||
)
|
||||
for text, _, source in all_votes
|
||||
]
|
||||
responses = [
|
||||
GenerationResponse(
|
||||
text=text,
|
||||
tokens_generated=0,
|
||||
tokens_per_second=0,
|
||||
latency_ms=0,
|
||||
backend_name=source
|
||||
)
|
||||
for text, _, source in all_votes
|
||||
]
|
||||
|
||||
# Use synchronous quality scoring (no embeddings needed)
|
||||
engine = ConsensusEngine(strategy="quality")
|
||||
# _quality_vote is async but only uses sync scoring, so we
|
||||
# use the simpler _fastest_vote-style approach here
|
||||
scores = [engine._quality_score(r) for r in responses]
|
||||
best_idx = max(range(len(scores)), key=lambda i: scores[i])
|
||||
best = all_votes[best_idx]
|
||||
print(f" ✓ Selected response from {best[2]} (quality score: {scores[best_idx]:.2f})")
|
||||
return best[0]
|
||||
|
||||
# Default: weighted selection - pick highest confidence
|
||||
best = max(all_votes, key=lambda x: x[1])
|
||||
print(f" ✓ Selected response from {best[2]} (confidence: {best[1]:.2f})")
|
||||
return best[0]
|
||||
# Use quality scoring to objectively compare all responses
|
||||
engine = ConsensusEngine(strategy="quality")
|
||||
scores = [engine._quality_score(r) for r in responses]
|
||||
|
||||
# Find best response based on actual quality, not self-reported confidence
|
||||
best_idx = max(range(len(scores)), key=lambda i: scores[i])
|
||||
best = all_votes[best_idx]
|
||||
|
||||
# Show comparison
|
||||
print(f" 📊 Quality scores:")
|
||||
for i, (text, conf, source) in enumerate(all_votes):
|
||||
print(f" {source}: {scores[i]:.2f} (self-reported: {conf:.2f})")
|
||||
|
||||
print(f" ✓ Selected response from {best[2]} (quality score: {scores[best_idx]:.2f})")
|
||||
return best[0], best[2]
|
||||
|
||||
async def get_federation_status(self) -> Dict[str, Any]:
|
||||
"""Get current federation status."""
|
||||
"""Get current federation status with peer metrics."""
|
||||
peers = self.discovery.get_peers()
|
||||
|
||||
# Check health of all peers
|
||||
@@ -308,7 +391,24 @@ class FederatedSwarm:
|
||||
health_results = await asyncio.gather(*health_checks, return_exceptions=True)
|
||||
|
||||
healthy_peers = []
|
||||
peer_metrics_info = []
|
||||
|
||||
for peer, healthy in zip(peers, health_results):
|
||||
peer_info = {
|
||||
"name": peer.name,
|
||||
"healthy": healthy is True,
|
||||
"timeout": peer.timeout_seconds,
|
||||
"model": peer.model_id,
|
||||
"instances": peer.instances,
|
||||
"metrics": {
|
||||
"success_rate": peer.metrics.success_rate,
|
||||
"avg_latency_ms": round(peer.metrics.avg_latency_ms, 2),
|
||||
"total_requests": peer.metrics.total_requests,
|
||||
"last_error": peer.metrics.last_error,
|
||||
}
|
||||
}
|
||||
peer_metrics_info.append(peer_info)
|
||||
|
||||
if healthy is True:
|
||||
healthy_peers.append(peer.name)
|
||||
|
||||
@@ -317,6 +417,7 @@ class FederatedSwarm:
|
||||
"total_peers": len(peers),
|
||||
"healthy_peers": len(healthy_peers),
|
||||
"peer_names": [p.name for p in peers],
|
||||
"peer_details": peer_metrics_info,
|
||||
"strategy": self.consensus_strategy
|
||||
}
|
||||
|
||||
|
||||
+21 -3
@@ -232,7 +232,7 @@ class SwarmManager:
|
||||
response = await worker.generate_with_progress(request)
|
||||
responses.append(response)
|
||||
if not self.mcp_mode:
|
||||
print(f" ✓ {worker.name} completed ({response.tokens_generated} tokens)")
|
||||
print(f" ✓ {worker.name} completed ({response.tokens_generated} tokens, {response.tokens_per_second:.1f} t/s)")
|
||||
except Exception as e:
|
||||
responses.append(e)
|
||||
if not self.mcp_mode:
|
||||
@@ -283,6 +283,11 @@ class SwarmManager:
|
||||
|
||||
if not self.mcp_mode:
|
||||
print(f" Got {len(valid_responses)} valid responses")
|
||||
|
||||
# Print performance summary
|
||||
print(f"\n 📊 Performance Summary:")
|
||||
for i, resp in enumerate(valid_responses, 1):
|
||||
print(f" Worker {i}: {resp.tokens_generated} tokens @ {resp.tokens_per_second:.1f} t/s ({resp.latency_ms:.0f}ms)")
|
||||
|
||||
# Run consensus
|
||||
result = await self.consensus.select_best(valid_responses)
|
||||
@@ -352,13 +357,21 @@ class SwarmManager:
|
||||
if not self.mcp_mode:
|
||||
print(f"🔄 Starting stream from {fastest_worker.name}...")
|
||||
chunk_count = 0
|
||||
total_chars = 0
|
||||
start_time = asyncio.get_event_loop().time()
|
||||
async for chunk in fastest_worker.generate_with_progress_stream(request):
|
||||
chunk_count += 1
|
||||
total_chars += len(chunk)
|
||||
if not self.mcp_mode and chunk_count % 50 == 0: # Print progress every 50 chunks
|
||||
print(f" Streamed {chunk_count} chunks...")
|
||||
yield chunk
|
||||
end_time = asyncio.get_event_loop().time()
|
||||
duration = end_time - start_time
|
||||
# Estimate tokens (roughly 4 chars per token)
|
||||
estimated_tokens = total_chars // 4
|
||||
tps = estimated_tokens / duration if duration > 0 else 0
|
||||
if not self.mcp_mode:
|
||||
print(f" Stream complete: {chunk_count} chunks total")
|
||||
print(f" Stream complete: {chunk_count} chunks, {estimated_tokens} tokens, {tps:.1f} t/s")
|
||||
|
||||
def get_status(self) -> SwarmStatus:
|
||||
"""Get current swarm status."""
|
||||
@@ -494,7 +507,7 @@ class SwarmManager:
|
||||
try:
|
||||
response = await worker.generate_with_progress(request)
|
||||
responses.append(response)
|
||||
print(f" ✓ Response {i+1} completed ({response.tokens_generated} tokens)")
|
||||
print(f" ✓ Response {i+1} completed ({response.tokens_generated} tokens, {response.tokens_per_second:.1f} t/s)")
|
||||
except Exception as e:
|
||||
responses.append(e)
|
||||
print(f" ✗ Response {i+1} failed: {e}")
|
||||
@@ -513,6 +526,11 @@ class SwarmManager:
|
||||
|
||||
print(f" Got {len(valid_responses)} valid responses")
|
||||
|
||||
# Print performance summary
|
||||
print(f"\n 📊 Performance Summary:")
|
||||
for i, resp in enumerate(valid_responses, 1):
|
||||
print(f" Seed {i}: {resp.tokens_generated} tokens @ {resp.tokens_per_second:.1f} t/s ({resp.latency_ms:.0f}ms)")
|
||||
|
||||
# Run consensus
|
||||
result = await self.consensus.select_best(valid_responses)
|
||||
print(f" Selected response using '{result.strategy}' strategy (confidence: {result.confidence:.2f})")
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
"""Swarm orchestration for Local Swarm.
|
||||
|
||||
Handles generation orchestration across multiple workers.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import List, Optional
|
||||
|
||||
from backends.base import GenerationRequest, GenerationResponse
|
||||
from swarm.consensus import ConsensusResult
|
||||
from swarm.worker import SwarmWorker
|
||||
|
||||
|
||||
class SwarmOrchestrator:
|
||||
"""Orchestrates generation across multiple workers."""
|
||||
|
||||
def __init__(self, workers: List[SwarmWorker], sequential_mode: bool = False, mcp_mode: bool = False):
|
||||
"""Initialize orchestrator.
|
||||
|
||||
Args:
|
||||
workers: List of swarm workers
|
||||
sequential_mode: Whether to run workers sequentially
|
||||
mcp_mode: Whether to suppress console output
|
||||
"""
|
||||
self.workers = workers
|
||||
self.sequential_mode = sequential_mode
|
||||
self.mcp_mode = mcp_mode
|
||||
|
||||
async def generate_single(
|
||||
self,
|
||||
worker: SwarmWorker,
|
||||
request: GenerationRequest
|
||||
) -> GenerationResponse:
|
||||
"""Generate using a single worker.
|
||||
|
||||
Args:
|
||||
worker: Worker to use
|
||||
request: Generation request
|
||||
|
||||
Returns:
|
||||
Generation response
|
||||
"""
|
||||
return await worker.generate_with_progress(request)
|
||||
|
||||
async def generate_parallel(
|
||||
self,
|
||||
workers: List[SwarmWorker],
|
||||
request: GenerationRequest
|
||||
) -> List[GenerationResponse]:
|
||||
"""Generate using multiple workers in parallel.
|
||||
|
||||
Args:
|
||||
workers: List of workers
|
||||
request: Generation request
|
||||
|
||||
Returns:
|
||||
List of generation responses
|
||||
"""
|
||||
tasks = [w.generate_with_progress(request) for w in workers]
|
||||
return await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
async def generate_sequential(
|
||||
self,
|
||||
workers: List[SwarmWorker],
|
||||
request: GenerationRequest
|
||||
) -> List[GenerationResponse]:
|
||||
"""Generate using multiple workers sequentially.
|
||||
|
||||
Args:
|
||||
workers: List of workers
|
||||
request: Generation request
|
||||
|
||||
Returns:
|
||||
List of generation responses
|
||||
"""
|
||||
responses = []
|
||||
for worker in workers:
|
||||
try:
|
||||
response = await worker.generate_with_progress(request)
|
||||
responses.append(response)
|
||||
except Exception as e:
|
||||
responses.append(e)
|
||||
return responses
|
||||
|
||||
def filter_responses(
|
||||
self,
|
||||
responses: List,
|
||||
workers: List[SwarmWorker]
|
||||
) -> List[GenerationResponse]:
|
||||
"""Filter out error responses.
|
||||
|
||||
Args:
|
||||
responses: List of responses (may contain exceptions)
|
||||
workers: Corresponding workers
|
||||
|
||||
Returns:
|
||||
List of valid responses
|
||||
"""
|
||||
valid = []
|
||||
for i, resp in enumerate(responses):
|
||||
if not isinstance(resp, Exception):
|
||||
valid.append(resp)
|
||||
return valid
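A minimal sketch of driving the orchestrator; the GenerationRequest fields shown here are assumptions, and `workers` is taken to be a list of already-initialized SwarmWorker instances.

```python
# Sketch: parallel generation followed by filtering out failed workers.
import asyncio
from backends.base import GenerationRequest  # field names below are assumed

async def run(workers):
    orchestrator = SwarmOrchestrator(workers, sequential_mode=False, mcp_mode=True)
    request = GenerationRequest(prompt="Summarize the consensus strategy", max_tokens=64)
    responses = await orchestrator.generate_parallel(workers, request)
    return orchestrator.filter_responses(responses, workers)

# valid_responses = asyncio.run(run(workers))
```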
@@ -66,25 +66,19 @@ class StatusMonitor:
|
||||
if not self.swarm_manager or not self.swarm_manager.workers:
|
||||
return
|
||||
|
||||
# Clear previous display
|
||||
self._clear_display()
|
||||
|
||||
# Get worker status
|
||||
workers = self.swarm_manager.workers
|
||||
generating_workers = [w for w in workers if w._is_generating]
|
||||
|
||||
if not generating_workers:
|
||||
# No active generation, show minimal status
|
||||
lines = []
|
||||
lines.append("📊 Workers Idle")
|
||||
for w in workers:
|
||||
status = "🟢" if w.is_healthy() else "🔴"
|
||||
ip_str = f" [{w._ip_address}]" if w._is_remote else ""
|
||||
lines.append(f" {status} {w.name}{ip_str}: Idle")
|
||||
|
||||
self._print_lines(lines)
|
||||
# No active generation, clear display and return (don't spam "Workers Idle")
|
||||
if self._last_lines > 0:
|
||||
self._clear_display()
|
||||
return
|
||||
|
||||
# Clear previous display
|
||||
self._clear_display()
|
||||
|
||||
# Active generation - show detailed status
|
||||
lines = []
|
||||
lines.append(f"⚡ {len(generating_workers)} Worker{'s' if len(generating_workers) > 1 else ''} Active")
|
||||
|
||||
+71 -41
@@ -5,12 +5,14 @@ Remote execution allows a single "tool host" to manage the workspace
|
||||
while workers perform distributed generation.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import aiohttp
|
||||
from typing import Optional
|
||||
|
||||
from utils.project_discovery import discover_project_root
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -84,29 +86,7 @@ class ToolExecutor:
|
||||
logger.debug(f" ❌ Error contacting tool host: {e}")
|
||||
return f"Error contacting tool host: {str(e)}"
|
||||
|
||||
def _discover_project_root(self, start_dir: Optional[str] = None) -> str:
|
||||
"""Discover the project root directory by looking for common markers."""
|
||||
import os
|
||||
if start_dir is None:
|
||||
start_dir = os.getcwd()
|
||||
current = os.path.abspath(start_dir)
|
||||
|
||||
# Common project root markers
|
||||
markers = ['.git', 'package.json', 'pyproject.toml', 'Cargo.toml', 'go.mod',
|
||||
'requirements.txt', 'setup.py', 'pom.xml', 'build.gradle', '.project', '.venv']
|
||||
|
||||
while True:
|
||||
try:
|
||||
if any(os.path.exists(os.path.join(current, marker)) for marker in markers):
|
||||
return current
|
||||
except Exception:
|
||||
pass # Permission errors, just skip
|
||||
parent = os.path.dirname(current)
|
||||
if parent == current: # Reached filesystem root
|
||||
break
|
||||
current = parent
|
||||
|
||||
return start_dir
|
||||
|
||||
|
||||
async def _execute_local(self, tool_name: str, tool_args: dict) -> str:
|
||||
"""Execute tool locally."""
|
||||
@@ -117,6 +97,8 @@ class ToolExecutor:
|
||||
return await self._execute_write(tool_args)
|
||||
elif tool_name == "bash":
|
||||
return await self._execute_bash(tool_args)
|
||||
elif tool_name == "webfetch":
|
||||
return await self._execute_webfetch(tool_args)
|
||||
elif tool_name == "question":
|
||||
return f"Question: {tool_args}"
|
||||
elif tool_name == "skill":
|
||||
@@ -127,7 +109,7 @@ class ToolExecutor:
|
||||
return "Current todo list: (empty)"
|
||||
else:
|
||||
return f"Tool '{tool_name}' not implemented"
|
||||
|
||||
|
||||
except Exception as e:
|
||||
return f"Error executing {tool_name}: {str(e)}"
|
||||
|
||||
@@ -139,6 +121,13 @@ class ToolExecutor:
|
||||
if not file_path:
|
||||
return "Error: filePath required"
|
||||
|
||||
# Check if original path was absolute or used ~ before expansion
|
||||
original_was_absolute = os.path.isabs(file_path) or file_path.startswith("~")
|
||||
|
||||
# Expand ~ to home directory
|
||||
file_path = os.path.expanduser(file_path)
|
||||
working_dir = os.path.expanduser(working_dir)
|
||||
|
||||
# Security: Prevent directory traversal
|
||||
file_path = os.path.normpath(file_path)
|
||||
if file_path.startswith("..") or file_path.startswith("/.."):
|
||||
@@ -150,14 +139,16 @@ class ToolExecutor:
|
||||
else:
|
||||
full_path = file_path
|
||||
|
||||
# Additional security: ensure resolved path is within working_dir
|
||||
try:
|
||||
real_working_dir = os.path.realpath(working_dir)
|
||||
real_full_path = os.path.realpath(full_path)
|
||||
if not real_full_path.startswith(real_working_dir):
|
||||
return f"Error: Access denied - path outside working directory"
|
||||
except Exception:
|
||||
pass # If realpath fails, continue anyway
|
||||
# Additional security: only enforce working_dir restriction for relative paths
|
||||
# If user explicitly specified an absolute path or ~ path, allow it
|
||||
if not original_was_absolute:
|
||||
try:
|
||||
real_working_dir = os.path.realpath(working_dir)
|
||||
real_full_path = os.path.realpath(full_path)
|
||||
if not real_full_path.startswith(real_working_dir):
|
||||
return f"Error: Access denied - path outside working directory"
|
||||
except Exception:
|
||||
pass # If realpath fails, continue anyway
|
||||
|
||||
logger.debug(f" 📁 Reading: {file_path}")
|
||||
logger.debug(f" 📍 Working dir: {working_dir}")
|
||||
@@ -181,6 +172,13 @@ class ToolExecutor:
|
||||
if not file_path:
|
||||
return "Error: filePath required"
|
||||
|
||||
# Check if original path was absolute or used ~ before expansion
|
||||
original_was_absolute = os.path.isabs(file_path) or file_path.startswith("~")
|
||||
|
||||
# Expand ~ to home directory
|
||||
file_path = os.path.expanduser(file_path)
|
||||
working_dir = os.path.expanduser(working_dir)
|
||||
|
||||
# Security: Prevent directory traversal
|
||||
file_path = os.path.normpath(file_path)
|
||||
if file_path.startswith("..") or file_path.startswith("/.."):
|
||||
@@ -192,14 +190,16 @@ class ToolExecutor:
|
||||
else:
|
||||
full_path = file_path
|
||||
|
||||
# Additional security: ensure resolved path is within working_dir
|
||||
try:
|
||||
real_working_dir = os.path.realpath(working_dir)
|
||||
real_full_path = os.path.realpath(full_path)
|
||||
if not real_full_path.startswith(real_working_dir):
|
||||
return f"Error: Access denied - path outside working directory"
|
||||
except Exception:
|
||||
pass # If realpath fails, continue anyway
|
||||
# Additional security: only enforce working_dir restriction for relative paths
|
||||
# If user explicitly specified an absolute path or ~ path, allow it
|
||||
if not original_was_absolute:
|
||||
try:
|
||||
real_working_dir = os.path.realpath(working_dir)
|
||||
real_full_path = os.path.realpath(full_path)
|
||||
if not real_full_path.startswith(real_working_dir):
|
||||
return f"Error: Access denied - path outside working directory"
|
||||
except Exception:
|
||||
pass # If realpath fails, continue anyway
|
||||
|
||||
logger.debug(f" 📁 Writing: {file_path}")
|
||||
logger.debug(f" 📍 Working dir: {working_dir}")
|
||||
@@ -226,6 +226,9 @@ class ToolExecutor:
|
||||
if not command:
|
||||
return "Error: command required"
|
||||
|
||||
# Expand ~ to home directory in cwd
|
||||
cwd = os.path.expanduser(cwd)
|
||||
|
||||
# Security: Block dangerous commands
|
||||
dangerous = ["rm -rf /", "> /dev", "mkfs", "dd if=/dev/zero", ":(){ :|:& };:"]
|
||||
for d in dangerous:
|
||||
@@ -328,7 +331,34 @@ class ToolExecutor:
|
||||
logger.debug(f" 📄 Partial output (last 500 chars): ...{partial_output[-500:]}")
|
||||
|
||||
return f"Error executing bash: {error_msg}"
|
||||
|
||||
|
||||
async def _execute_webfetch(self, args: dict) -> str:
|
||||
"""Execute webfetch tool."""
|
||||
url = args.get("url", "")
|
||||
format = args.get("format", "text") # Default to text
|
||||
|
||||
if not url:
|
||||
return "Error: url required"
|
||||
|
||||
logger.debug(f" 🌐 Fetching: {url[:100]}... (format: {format})")
|
||||
|
||||
try:
|
||||
session = await self._get_session()
|
||||
async with session.get(url, timeout=aiohttp.ClientTimeout(total=30)) as resp:
|
||||
if resp.status == 200:
|
||||
content = await resp.text()
|
||||
logger.debug(f" ✓ Fetched {len(content)} chars")
|
||||
return content
|
||||
else:
|
||||
logger.debug(f" ❌ HTTP {resp.status}: {url[:100]}")
|
||||
return f"Error: HTTP {resp.status} from {url[:100]}"
|
||||
except asyncio.TimeoutError:
|
||||
logger.debug(f" ⏰ Timeout fetching: {url[:100]}")
|
||||
return f"Error: Timeout fetching {url[:100]} (30s)"
|
||||
except Exception as e:
|
||||
logger.debug(f" ❌ Error: {e}")
|
||||
return f"Error fetching {url[:100]}: {str(e)}"
|
||||
|
||||
async def close(self):
|
||||
"""Close HTTP session."""
|
||||
if self._session:
|
||||
|
||||
@@ -7,11 +7,11 @@ import logging
|
||||
import sys
|
||||
|
||||
|
||||
def setup_logging(level=logging.DEBUG):
|
||||
def setup_logging(level=logging.INFO):
|
||||
"""Set up logging configuration.
|
||||
|
||||
Args:
|
||||
level: Logging level (default: DEBUG for development)
|
||||
level: Logging level (default: INFO)
|
||||
"""
|
||||
# Create formatter
|
||||
formatter = logging.Formatter(
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
"""Network utilities for Local Swarm."""
|
||||
|
||||
import socket
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def get_local_ip() -> str:
|
||||
"""Get the local network IP address (private networks only).
|
||||
|
||||
Returns:
|
||||
Local IP address or 127.0.0.1 if detection fails
|
||||
"""
|
||||
try:
|
||||
# Create a socket and connect to a public DNS server
|
||||
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||
s.settimeout(2)
|
||||
# Try to connect to Google's DNS - this doesn't actually send data
|
||||
s.connect(("8.8.8.8", 80))
|
||||
ip = s.getsockname()[0]
|
||||
s.close()
|
||||
|
||||
# Check if it's a private IP
|
||||
is_private = ip.startswith('192.168.')
|
||||
|
||||
if is_private:
|
||||
print(f" 📡 Detected local IP: {ip}")
|
||||
return ip
|
||||
else:
|
||||
print(f" ⚠️ IP {ip} is not a private network, binding to localhost")
|
||||
return "127.0.0.1"
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Could not detect local IP: {e}, using localhost")
|
||||
return "127.0.0.1"
|
||||
|
||||
|
||||
def is_private_ip(ip: str) -> bool:
|
||||
"""Check if an IP address is private.
|
||||
|
||||
Args:
|
||||
ip: IP address string
|
||||
|
||||
Returns:
|
||||
True if IP is private
|
||||
"""
|
||||
return ip.startswith('192.168.') or ip.startswith('10.') or any(ip.startswith(f'172.{i}.') for i in range(16, 32))
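For comparison, the standard library's `ipaddress` module covers the same check without hand-rolled prefix matching (including the rest of 172.16.0.0/12 and IPv6 ranges). This is only a sketch, not part of the diff, and note that it also counts loopback and link-local addresses as private:

```python
import ipaddress

def is_private_ip_std(ip: str) -> bool:
    """Stdlib variant: True for private/loopback/link-local, False for unparseable input."""
    try:
        return ipaddress.ip_address(ip).is_private
    except ValueError:
        return False
```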
|
||||
@@ -0,0 +1,86 @@
|
||||
"""Project root discovery utilities.
|
||||
|
||||
Provides functionality to discover project root directories.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
# Common project root markers
|
||||
DEFAULT_MARKERS = [
|
||||
'.git', 'package.json', 'pyproject.toml', 'Cargo.toml', 'go.mod',
|
||||
'requirements.txt', 'setup.py', 'pom.xml', 'build.gradle', '.project', '.venv'
|
||||
]
|
||||
|
||||
|
||||
def discover_project_root(
|
||||
start_dir: Optional[str] = None,
|
||||
markers: Optional[List[str]] = None
|
||||
) -> str:
|
||||
"""Discover the project root directory by looking for common markers.
|
||||
|
||||
Args:
|
||||
start_dir: Directory to start searching from (defaults to cwd)
|
||||
markers: List of marker files/directories to look for (defaults to DEFAULT_MARKERS)
|
||||
|
||||
Returns:
|
||||
Path to project root, or start_dir if no markers found
|
||||
"""
|
||||
if start_dir is None:
|
||||
start_dir = os.getcwd()
|
||||
|
||||
if markers is None:
|
||||
markers = DEFAULT_MARKERS
|
||||
|
||||
current = os.path.abspath(start_dir)
|
||||
|
||||
while True:
|
||||
try:
|
||||
if any(os.path.exists(os.path.join(current, marker)) for marker in markers):
|
||||
return current
|
||||
except (OSError, PermissionError):
|
||||
pass # Permission errors, just skip
|
||||
|
||||
parent = os.path.dirname(current)
|
||||
if parent == current: # Reached filesystem root
|
||||
break
|
||||
current = parent
|
||||
|
||||
return start_dir
|
||||
|
||||
|
||||
def is_within_project(path: str, project_root: str) -> bool:
|
||||
"""Check if a path is within a project root.
|
||||
|
||||
Args:
|
||||
path: Path to check
|
||||
project_root: Project root directory
|
||||
|
||||
Returns:
|
||||
True if path is within project root
|
||||
"""
|
||||
try:
|
||||
real_path = os.path.realpath(path)
|
||||
real_root = os.path.realpath(project_root)
|
||||
return real_path.startswith(real_root)
|
||||
except (OSError, ValueError):
|
||||
return False
|
||||
|
||||
|
||||
def get_relative_to_project(path: str, project_root: str) -> str:
|
||||
"""Get path relative to project root.
|
||||
|
||||
Args:
|
||||
path: Absolute or relative path
|
||||
project_root: Project root directory
|
||||
|
||||
Returns:
|
||||
Path relative to project root
|
||||
"""
|
||||
try:
|
||||
real_path = os.path.realpath(path)
|
||||
real_root = os.path.realpath(project_root)
|
||||
return os.path.relpath(real_path, real_root)
|
||||
except (OSError, ValueError):
|
||||
return path
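Since `ToolExecutor` imports `discover_project_root` from `utils.project_discovery`, these helpers can be exercised directly; a brief usage sketch (the paths are illustrative):

```python
from utils.project_discovery import (
    discover_project_root, is_within_project, get_relative_to_project
)

root = discover_project_root()                       # walks upward until it finds .git, pyproject.toml, ...
print(root)
print(is_within_project("/etc/passwd", root))        # normally False
print(get_relative_to_project(root + "/src/api/routes.py", root))  # -> "src/api/routes.py"
```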
|
||||
@@ -0,0 +1,90 @@
|
||||
"""Token counting utilities for Local Swarm.
|
||||
|
||||
Centralizes token counting functionality to avoid duplication across modules.
|
||||
"""
|
||||
|
||||
import tiktoken
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# Initialize tokenizer for accurate token counting
|
||||
TOKEN_ENCODING = tiktoken.get_encoding('cl100k_base')
|
||||
|
||||
|
||||
def count_tokens(text: str) -> int:
|
||||
"""Count tokens in a text string using tiktoken.
|
||||
|
||||
Args:
|
||||
text: Text to count tokens for
|
||||
|
||||
Returns:
|
||||
Number of tokens
|
||||
"""
|
||||
if not text:
|
||||
return 0
|
||||
return len(TOKEN_ENCODING.encode(text))
|
||||
|
||||
|
||||
def count_tokens_in_messages(messages: list) -> int:
|
||||
"""Count tokens in a list of messages.
|
||||
|
||||
Args:
|
||||
messages: List of message objects with content attribute
|
||||
|
||||
Returns:
|
||||
Total token count
|
||||
"""
|
||||
total = 0
|
||||
for msg in messages:
|
||||
if hasattr(msg, 'content') and msg.content:
|
||||
total += count_tokens(msg.content)
|
||||
return total
|
||||
|
||||
|
||||
def estimate_tokens_from_characters(char_count: int, chars_per_token: int = 4) -> int:
|
||||
"""Estimate token count from character count.
|
||||
|
||||
This is a fallback when tiktoken is not available or for quick estimates.
|
||||
|
||||
Args:
|
||||
char_count: Number of characters
|
||||
chars_per_token: Average characters per token (default 4)
|
||||
|
||||
Returns:
|
||||
Estimated token count
|
||||
"""
|
||||
return char_count // chars_per_token
|
||||
|
||||
|
||||
def truncate_to_max_tokens(text: str, max_tokens: int) -> str:
|
||||
"""Truncate text to fit within max tokens.
|
||||
|
||||
Args:
|
||||
text: Text to truncate
|
||||
max_tokens: Maximum number of tokens allowed
|
||||
|
||||
Returns:
|
||||
Truncated text
|
||||
"""
|
||||
tokens = TOKEN_ENCODING.encode(text)
|
||||
if len(tokens) <= max_tokens:
|
||||
return text
|
||||
truncated = tokens[:max_tokens]
|
||||
return TOKEN_ENCODING.decode(truncated)
|
||||
|
||||
|
||||
def format_token_info(prompt_tokens: int, completion_tokens: int) -> dict:
|
||||
"""Format token information for responses.
|
||||
|
||||
Args:
|
||||
prompt_tokens: Number of prompt tokens
|
||||
completion_tokens: Number of completion tokens
|
||||
|
||||
Returns:
|
||||
Dictionary with token counts
|
||||
"""
|
||||
return {
|
||||
"prompt_tokens": prompt_tokens,
|
||||
"completion_tokens": completion_tokens,
|
||||
"total_tokens": prompt_tokens + completion_tokens
|
||||
}
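A quick usage sketch for the helpers above; the import path is a guess (the extracted diff header does not show the new file's name), and exact counts depend on tiktoken's cl100k_base vocabulary:

```python
from utils.token_counter import (  # hypothetical module path
    count_tokens, truncate_to_max_tokens, format_token_info
)

prompt = "Summarize the last tool output in one sentence."
n = count_tokens(prompt)                   # exact count via tiktoken
short = truncate_to_max_tokens(prompt, 5)  # first 5 tokens, decoded back to text
print(n, short)
print(format_token_info(prompt_tokens=n, completion_tokens=12))
```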
|
||||
@@ -0,0 +1,16 @@
|
||||
# Patch to add real-time streaming for tools
|
||||
|
||||
# This patch adds real-time streaming of assistant content ("thinking") and tool calls
|
||||
# when tools are used. Previously, all content was buffered until complete,
|
||||
# causing opencode to wait with no feedback.
|
||||
|
||||
# Key changes:
|
||||
# 1. Stream model output incrementally as it's generated
|
||||
# 2. Parse for tool_calls and content in each chunk
|
||||
# 3. Send content chunks immediately (the "thinking")
|
||||
# 4. Send tool_calls deltas immediately when found
|
||||
# 5. Don't execute tools server-side in streaming mode
|
||||
# 6. Send DONE marker at end
|
||||
|
||||
# Apply this patch with:
|
||||
# patch -p1 < this_file src/api/routes.py
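A rough sketch of what the described streaming loop could look like server-side. The `generate_stream` method, the SSE chunk shape, and the delta bookkeeping are assumptions for illustration (a real handler would also track which tool-call deltas were already emitted); this is not the patch itself:

```python
import json
from api.tool_parser import parse_tool_calls

async def stream_completion(swarm, prompt: str):
    """Yield OpenAI-style SSE chunks as text arrives instead of buffering (illustrative)."""
    buffer = ""
    async for piece in swarm.generate_stream(prompt):  # hypothetical streaming API
        buffer += piece
        content, tool_calls = parse_tool_calls(buffer)  # re-parse the accumulated text
        if tool_calls:
            # Send tool_call deltas immediately; the client executes them (no server-side execution).
            yield f"data: {json.dumps({'choices': [{'delta': {'tool_calls': tool_calls}}]})}\n\n"
        elif piece:
            # Stream the "thinking" content as it is generated.
            yield f"data: {json.dumps({'choices': [{'delta': {'content': piece}}]})}\n\n"
    yield "data: [DONE]\n\n"
```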
|
||||
@@ -0,0 +1,63 @@
|
||||
## Test Plan for CUDA and Android Support
|
||||
|
||||
### Unit Tests
|
||||
|
||||
#### Test Case 1: NVIDIA GPU Detection
|
||||
- **Input:** System with NVIDIA GPU and pynvml installed
|
||||
- **Expected Output:** GPUInfo with correct name, VRAM, and is_nvidia=True
|
||||
- **Location:** src/hardware/detector.py:detect_nvidia_gpu()
|
||||
|
||||
#### Test Case 2: GPU Layer Configuration for CUDA
|
||||
- **Input:** HardwareProfile with NVIDIA GPU (4GB VRAM)
|
||||
- **Expected Output:** n_gpu_layers=-1 (all layers), proper CUDA configuration
|
||||
- **Location:** src/backends/__init__.py:create_backend()
|
||||
|
||||
#### Test Case 3: Android Platform Detection
|
||||
- **Input:** platform.system() returns 'Linux', Termux environment detected
|
||||
- **Expected Output:** is_android=True, proper Android path handling
|
||||
- **Location:** src/hardware/detector.py:detect_android()
|
||||
|
||||
#### Test Case 4: PeerInfo with Timeout
|
||||
- **Input:** PeerInfo with custom timeout
|
||||
- **Expected Output:** FederationClient respects peer timeout
|
||||
- **Location:** src/network/discovery.py:PeerInfo
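As a concrete shape for these cases, Test Case 2 might look roughly like the sketch below; the `create_backend` signature, the `make_profile` helper, and the `config.n_gpu_layers` attribute are assumptions, not code from this change:

```python
from hardware.detector import GPUInfo
from backends import create_backend  # src/backends/__init__.py

def test_create_backend_full_gpu_offload():
    """Test Case 2 sketch: a 4 GB NVIDIA profile should request n_gpu_layers=-1."""
    gpu = GPUInfo(name="NVIDIA GTX 1650", vram_gb=4.0, is_nvidia=True, compute_capability="7.5")
    profile = make_profile(gpu=gpu)           # hypothetical helper that builds a HardwareProfile
    backend = create_backend(profile)         # factory under test (assumed signature)
    assert backend.config.n_gpu_layers == -1  # -1 = offload all layers to CUDA
```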
|
||||
|
||||
### Integration Tests
|
||||
|
||||
#### End-to-End Flow 1: CUDA Backend Creation
|
||||
1. Detect hardware with NVIDIA GPU
|
||||
2. Create backend via factory
|
||||
3. Verify n_gpu_layers=-1 set
|
||||
4. Load test model
|
||||
5. Expected: Successful GPU offload
|
||||
|
||||
#### End-to-End Flow 2: Android Device Join Federation
|
||||
1. Start discovery on Android (Termux)
|
||||
2. Advertise Android hardware
|
||||
3. Join federation from macOS peer
|
||||
4. Send vote request
|
||||
5. Expected: Android responds successfully
|
||||
|
||||
#### End-to-End Flow 3: Federation with Per-Peer Timeout
|
||||
1. Add peer with 30s timeout
|
||||
2. Add peer with 60s timeout
|
||||
3. Request votes from both
|
||||
4. Expected: Each peer uses its own timeout
|
||||
|
||||
### Manual Verification
|
||||
|
||||
#### Command to Run:
|
||||
```bash
|
||||
python -m pytest tests/ -v -k "cuda or android or federation"
|
||||
```
|
||||
|
||||
#### Expected Output:
|
||||
- All tests pass
|
||||
- No ImportError for pynvml
|
||||
- GPU layer detection works on CUDA machines
|
||||
- Android detection passes on Termux
|
||||
|
||||
#### Platform Testing:
|
||||
1. **macOS (Apple Silicon):** MLX backend loads
|
||||
2. **Linux (NVIDIA):** CUDA backend auto-detects
|
||||
3. **Android (Termux):** CPU-only mode, proper paths
|
||||
@@ -0,0 +1,140 @@
|
||||
"""Test Apple Silicon MLX auto-detection and download."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Add src to path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
||||
|
||||
def test_apple_silicon_mlx_selection():
|
||||
"""Test that Apple Silicon correctly selects MLX models."""
|
||||
from hardware.detector import HardwareProfile, GPUInfo
|
||||
from models.selector import select_optimal_model
|
||||
|
||||
# Mock Apple Silicon hardware
|
||||
class MockAppleHardware:
|
||||
os = "darwin"
|
||||
cpu_cores = 12
|
||||
ram_gb = 24.0
|
||||
ram_available_gb = 12.0
|
||||
is_apple_silicon = True
|
||||
has_dedicated_gpu = False
|
||||
gpu = GPUInfo(name="Apple Silicon GPU", vram_gb=24.0, driver_version=None)
|
||||
available_memory_gb = 12.0
|
||||
recommended_memory_gb = 12.0
|
||||
|
||||
hardware = MockAppleHardware()
|
||||
|
||||
# Test auto-detection (use_mlx=None)
|
||||
print("=" * 60)
|
||||
print("Apple Silicon MLX Auto-Detection Test")
|
||||
print("=" * 60)
|
||||
|
||||
print("\n1. Testing auto-detection (use_mlx=None)...")
|
||||
config = select_optimal_model(hardware, use_mlx=None)
|
||||
|
||||
assert config is not None, "Should find a model"
|
||||
print(f" ✓ Model selected: {config.model.name}")
|
||||
|
||||
# Verify quantization is MLX format (4bit, 8bit, etc.)
|
||||
print("\n2. Verifying MLX quantization format...")
|
||||
is_mlx_format = 'bit' in config.quantization.name.lower()
|
||||
assert is_mlx_format, f"Quantization should be MLX format (4bit/8bit), got {config.quantization.name}"
|
||||
print(f" ✓ Quantization: {config.quantization.name} (MLX format)")
|
||||
|
||||
# Test repository name generation
|
||||
print("\n3. Testing MLX repository name generation...")
|
||||
from models.registry import get_model_hf_repo_mlx
|
||||
|
||||
mlx_repo = get_model_hf_repo_mlx(config.model.id, config.variant, config.quantization)
|
||||
assert mlx_repo is not None, "MLX repository should be generated"
|
||||
assert "mlx-community" in mlx_repo, "Should use mlx-community namespace"
|
||||
assert "-Instruct-" in mlx_repo, "Should have -Instruct- suffix"
|
||||
assert config.quantization.name in mlx_repo, "Should include quantization"
|
||||
print(f" ✓ Repository: {mlx_repo}")
|
||||
|
||||
# Verify it's NOT using GGUF format
|
||||
print("\n4. Verifying NOT using GGUF format...")
|
||||
has_gguf = 'q4_k_m' in config.quantization.name or 'q5_k_m' in config.quantization.name
|
||||
has_gguf_suffix = '-GGUF' in mlx_repo
|
||||
assert not has_gguf, f"Should not use GGUF quantization names"
|
||||
assert not has_gguf_suffix, f"Should not use GGUF repository suffix"
|
||||
print(f" ✓ Not using GGUF format")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("All Apple Silicon MLX tests passed!")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
def test_nvidia_gpu_gguf_selection():
|
||||
"""Test that NVIDIA GPU correctly selects GGUF models."""
|
||||
from hardware.detector import HardwareProfile, GPUInfo
|
||||
from models.selector import select_optimal_model
|
||||
|
||||
# Mock NVIDIA hardware
|
||||
class MockNvidiaHardware:
|
||||
os = "linux"
|
||||
cpu_cores = 8
|
||||
ram_gb = 32.0
|
||||
ram_available_gb = 20.0
|
||||
is_apple_silicon = False
|
||||
has_dedicated_gpu = True
|
||||
gpu = GPUInfo(name="NVIDIA RTX 4090", vram_gb=24.0, driver_version="550.80")
|
||||
available_memory_gb = 20.0
|
||||
recommended_memory_gb = 20.0
|
||||
|
||||
hardware = MockNvidiaHardware()
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("NVIDIA GPU GGUF Auto-Detection Test")
|
||||
print("=" * 60)
|
||||
|
||||
print("\n1. Testing auto-detection (use_mlx=None)...")
|
||||
config = select_optimal_model(hardware, use_mlx=None)
|
||||
|
||||
assert config is not None, "Should find a model"
|
||||
print(f" ✓ Model selected: {config.model.name}")
|
||||
|
||||
# Verify quantization is GGUF format (q4_k_m, q5_k_m, etc.)
|
||||
print("\n2. Verifying GGUF quantization format...")
|
||||
is_gguf_format = 'q' in config.quantization.name.lower()
|
||||
assert is_gguf_format, f"Quantization should be GGUF format (q4_k_m/q5_k_m), got {config.quantization.name}"
|
||||
print(f" ✓ Quantization: {config.quantization.name} (GGUF format)")
|
||||
|
||||
# Test repository name generation
|
||||
print("\n3. Testing GGUF repository name generation...")
|
||||
from models.registry import get_model_hf_repo
|
||||
|
||||
gguf_repo = get_model_hf_repo(config.model.id, config.variant, config.quantization)
|
||||
assert gguf_repo is not None, "GGUF repository should be generated"
|
||||
assert "-GGUF" in gguf_repo, "Should have -GGUF suffix"
|
||||
print(f" ✓ Repository: {gguf_repo}")
|
||||
|
||||
# Verify it's NOT using MLX format
|
||||
print("\n4. Verifying NOT using MLX format...")
|
||||
has_mlx_format = 'bit' in config.quantization.name.lower() and config.quantization.name not in ['q4_k_m', 'q5_k_m', 'q6_k']
|
||||
has_mlx_namespace = 'mlx-community' in gguf_repo
|
||||
assert not has_mlx_namespace, f"Should not use mlx-community namespace"
|
||||
print(f" ✓ Not using MLX format")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("All NVIDIA GPU GGUF tests passed!")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
test_apple_silicon_mlx_selection()
|
||||
test_nvidia_gpu_gguf_selection()
|
||||
print("\n" + "=" * 60)
|
||||
print("ALL AUTO-DETECTION TESTS PASSED!")
|
||||
print("=" * 60)
|
||||
except AssertionError as e:
|
||||
print(f"\n❌ Test failed: {e}")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
@@ -0,0 +1,100 @@
|
||||
"""End-to-end test for tool execution with a mock server.
|
||||
|
||||
This tests the complete flow:
|
||||
1. Model generates tool call
|
||||
2. Tools are executed
|
||||
3. Response is generated based on tool results
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_flow():
|
||||
"""Test the tool execution flow end-to-end."""
|
||||
|
||||
# Import after path is set
|
||||
from api.models import ChatMessage, ChatCompletionRequest
|
||||
from api.tool_parser import parse_tool_calls
|
||||
from api.formatting import format_messages_with_tools
|
||||
from tools.executor import ToolExecutor
|
||||
|
||||
print("=" * 60)
|
||||
print("End-to-End Tool Execution Test")
|
||||
print("=" * 60)
|
||||
|
||||
# Test 1: Parse tool call from model response
|
||||
print("\n1. Testing tool parsing...")
|
||||
model_response = "TOOL: bash\nARGUMENTS: {\"command\": \"echo hello\"}"
|
||||
|
||||
content, tool_calls = parse_tool_calls(model_response)
|
||||
assert tool_calls is not None, "Should parse tool call"
|
||||
assert len(tool_calls) == 1, "Should have one tool call"
|
||||
assert tool_calls[0]["function"]["name"] == "bash", "Should be bash tool"
|
||||
print(f" ✓ Parsed tool: {tool_calls[0]['function']['name']}")
|
||||
|
||||
# Test 2: Simulate tool result and format for next prompt
|
||||
print("\n2. Testing tool result formatting...")
|
||||
tool_result = "hello\n"
|
||||
|
||||
# Build conversation history
|
||||
messages = [
|
||||
ChatMessage(role="user", content="Run echo hello"),
|
||||
ChatMessage(role="assistant", content=model_response),
|
||||
ChatMessage(role="tool", content=tool_result)
|
||||
]
|
||||
|
||||
# Format for next generation
|
||||
next_prompt = format_messages_with_tools(messages, None)
|
||||
assert "tool" in next_prompt.lower(), "Prompt should include tool result"
|
||||
assert "hello" in next_prompt, "Prompt should include tool output"
|
||||
print(f" ✓ Tool result formatted for next prompt")
|
||||
|
||||
# Test 3: Verify loop detection
|
||||
print("\n3. Testing loop detection...")
|
||||
seen_tools = set()
|
||||
|
||||
# First tool call
|
||||
tc1 = [{"function": {"name": "bash", "arguments": '{"command": "ls"}'}}]
|
||||
sig1 = tc1[0]["function"]["name"] + ":" + tc1[0]["function"]["arguments"][:50]
|
||||
seen_tools.add(sig1)
|
||||
print(f" ✓ First tool call tracked")
|
||||
|
||||
# Duplicate tool call
|
||||
tc2 = tc1
|
||||
sig2 = sig1
|
||||
is_duplicate = sig2 in seen_tools
|
||||
assert is_duplicate, "Should detect duplicate"
|
||||
print(f" ✓ Duplicate tool call detected")
|
||||
|
||||
# Test 4: Verify tool result truncation
|
||||
print("\n4. Testing tool result truncation...")
|
||||
long_result = "a" * 3000
|
||||
max_length = 2000
|
||||
|
||||
if len(long_result) > max_length:
|
||||
truncated = long_result[:max_length] + "\n[...truncated...]"
|
||||
assert len(truncated) == max_length + len("\n[...truncated...]"), "Should truncate properly"
|
||||
print(f" ✓ Tool result truncated from {len(long_result)} to {len(truncated)} chars")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("All end-to-end tests passed!")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.run(test_tool_flow())
|
||||
except AssertionError as e:
|
||||
print(f"\n❌ Test failed: {e}")
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
@@ -0,0 +1,166 @@
|
||||
"""Tests for federation metrics and peer timeout."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
||||
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
from network.discovery import PeerInfo, PeerMetrics
|
||||
from network.federation import FederationClient, PeerVote
|
||||
|
||||
|
||||
class TestPeerMetrics:
|
||||
"""Test peer metrics tracking."""
|
||||
|
||||
def test_peer_metrics_defaults(self):
|
||||
"""Test default metric values."""
|
||||
metrics = PeerMetrics()
|
||||
assert metrics.total_requests == 0
|
||||
assert metrics.successful_requests == 0
|
||||
assert metrics.failed_requests == 0
|
||||
assert metrics.success_rate == 1.0 # No requests = 100% success
|
||||
|
||||
def test_record_success(self):
|
||||
"""Test recording successful requests."""
|
||||
metrics = PeerMetrics()
|
||||
metrics.record_success(100.0)
|
||||
|
||||
assert metrics.total_requests == 1
|
||||
assert metrics.successful_requests == 1
|
||||
assert metrics.failed_requests == 0
|
||||
assert metrics.success_rate == 1.0
|
||||
assert metrics.avg_latency_ms == 100.0
|
||||
|
||||
# Record another success
|
||||
metrics.record_success(200.0)
|
||||
assert metrics.total_requests == 2
|
||||
assert metrics.avg_latency_ms == 150.0 # (100 + 200) / 2
|
||||
|
||||
def test_record_failure(self):
|
||||
"""Test recording failed requests."""
|
||||
metrics = PeerMetrics()
|
||||
metrics.record_failure("Connection timeout")
|
||||
|
||||
assert metrics.total_requests == 1
|
||||
assert metrics.successful_requests == 0
|
||||
assert metrics.failed_requests == 1
|
||||
assert metrics.success_rate == 0.0
|
||||
assert metrics.last_error == "Connection timeout"
|
||||
assert metrics.last_error_time is not None
|
||||
|
||||
def test_mixed_success_and_failure(self):
|
||||
"""Test mixed success and failure recording."""
|
||||
metrics = PeerMetrics()
|
||||
metrics.record_success(100.0)
|
||||
metrics.record_failure("Error")
|
||||
metrics.record_success(150.0)
|
||||
|
||||
assert metrics.total_requests == 3
|
||||
assert metrics.successful_requests == 2
|
||||
assert metrics.failed_requests == 1
|
||||
assert metrics.success_rate == 2/3
|
||||
|
||||
|
||||
class TestPeerInfo:
|
||||
"""Test PeerInfo with metrics and timeout."""
|
||||
|
||||
def test_peer_info_defaults(self):
|
||||
"""Test PeerInfo default values."""
|
||||
peer = PeerInfo(
|
||||
host="192.168.1.100",
|
||||
port=17615,
|
||||
name="test-peer",
|
||||
version="0.1.0",
|
||||
instances=2,
|
||||
model_id="qwen:7b:q4",
|
||||
hardware_summary="Apple M1 Pro",
|
||||
last_seen=datetime.now()
|
||||
)
|
||||
|
||||
assert peer.timeout_seconds == 60.0 # Default timeout
|
||||
assert peer.metrics is not None
|
||||
assert isinstance(peer.metrics, PeerMetrics)
|
||||
assert peer.api_url == "http://192.168.1.100:17615"
|
||||
|
||||
def test_peer_info_custom_timeout(self):
|
||||
"""Test PeerInfo with custom timeout."""
|
||||
peer = PeerInfo(
|
||||
host="192.168.1.100",
|
||||
port=17615,
|
||||
name="slow-peer",
|
||||
version="0.1.0",
|
||||
instances=1,
|
||||
model_id="test-model",
|
||||
hardware_summary="CPU only",
|
||||
last_seen=datetime.now(),
|
||||
timeout_seconds=120.0 # Custom timeout
|
||||
)
|
||||
|
||||
assert peer.timeout_seconds == 120.0
|
||||
|
||||
|
||||
class TestFederationClient:
|
||||
"""Test FederationClient with peer-specific timeouts."""
|
||||
|
||||
@pytest.fixture
|
||||
def client(self):
|
||||
return FederationClient(timeout=60.0)
|
||||
|
||||
@pytest.fixture
|
||||
def fast_peer(self):
|
||||
return PeerInfo(
|
||||
host="192.168.1.10",
|
||||
port=17615,
|
||||
name="fast-peer",
|
||||
version="0.1.0",
|
||||
instances=2,
|
||||
model_id="qwen:7b:q4",
|
||||
hardware_summary="Apple M1 Max",
|
||||
last_seen=datetime.now(),
|
||||
timeout_seconds=30.0 # Fast peer with short timeout
|
||||
)
|
||||
|
||||
@pytest.fixture
|
||||
def slow_peer(self):
|
||||
return PeerInfo(
|
||||
host="192.168.1.11",
|
||||
port=17615,
|
||||
name="slow-peer",
|
||||
version="0.1.0",
|
||||
instances=1,
|
||||
model_id="qwen:7b:q4",
|
||||
hardware_summary="CPU only",
|
||||
last_seen=datetime.now(),
|
||||
timeout_seconds=90.0 # Slow peer with longer timeout
|
||||
)
|
||||
|
||||
def test_peer_timeout_override(self, client, fast_peer, slow_peer):
|
||||
"""Test that peer-specific timeout overrides default."""
|
||||
# The client should use the peer's timeout, not the default
|
||||
assert fast_peer.timeout_seconds == 30.0
|
||||
assert slow_peer.timeout_seconds == 90.0
|
||||
assert client.timeout == 60.0 # Default unchanged
|
||||
|
||||
def test_metrics_updated_on_success(self, fast_peer):
|
||||
"""Test that metrics are updated on successful request."""
|
||||
assert fast_peer.metrics.total_requests == 0
|
||||
|
||||
# Simulate recording a success (this would happen in request_vote)
|
||||
fast_peer.metrics.record_success(150.0)
|
||||
|
||||
assert fast_peer.metrics.total_requests == 1
|
||||
assert fast_peer.metrics.successful_requests == 1
|
||||
assert fast_peer.metrics.success_rate == 1.0
|
||||
|
||||
def test_metrics_updated_on_failure(self, slow_peer):
|
||||
"""Test that metrics are updated on failed request."""
|
||||
assert slow_peer.metrics.total_requests == 0
|
||||
|
||||
# Simulate recording a failure
|
||||
slow_peer.metrics.record_failure("Connection refused")
|
||||
|
||||
assert slow_peer.metrics.total_requests == 1
|
||||
assert slow_peer.metrics.failed_requests == 1
|
||||
assert slow_peer.metrics.success_rate == 0.0
|
||||
assert slow_peer.metrics.last_error == "Connection refused"
|
||||
@@ -0,0 +1,176 @@
|
||||
"""Tests for hardware detection and GPU layer configuration."""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch, MagicMock
|
||||
from hardware.detector import (
|
||||
GPUInfo, HardwareProfile, detect_nvidia_gpu,
|
||||
calculate_gpu_layers, validate_gpu_layers, is_android
|
||||
)
|
||||
|
||||
|
||||
class TestNvidiaGPU:
|
||||
"""Test NVIDIA GPU detection."""
|
||||
|
||||
def test_detect_nvidia_gpu_success(self):
|
||||
"""Test successful NVIDIA GPU detection."""
|
||||
# Mock the entire import system
|
||||
mock_pynvml = Mock()
|
||||
mock_pynvml.nvmlInit = Mock()
|
||||
mock_pynvml.nvmlShutdown = Mock()
|
||||
mock_pynvml.nvmlDeviceGetCount = Mock(return_value=1)
|
||||
|
||||
# Mock device handle and info
|
||||
mock_handle = Mock()
|
||||
mock_pynvml.nvmlDeviceGetHandleByIndex = Mock(return_value=mock_handle)
|
||||
mock_pynvml.nvmlDeviceGetName = Mock(return_value="NVIDIA GeForce RTX 3080")
|
||||
|
||||
# Mock memory info
|
||||
mock_mem = Mock()
|
||||
mock_mem.total = 10737418240 # 10 GB
|
||||
mock_pynvml.nvmlDeviceGetMemoryInfo = Mock(return_value=mock_mem)
|
||||
|
||||
# Mock driver version
|
||||
mock_pynvml.nvmlSystemGetDriverVersion = Mock(return_value="535.104.05")
|
||||
|
||||
# Mock compute capability
|
||||
mock_pynvml.nvmlDeviceGetCudaComputeCapability = Mock(return_value=(8, 6))
|
||||
|
||||
# Patch __import__ to return our mock
|
||||
def mock_import(name, *args, **kwargs):
|
||||
if name == 'pynvml':
|
||||
return mock_pynvml
|
||||
return __builtins__.__import__(name, *args, **kwargs)
|
||||
|
||||
with patch('builtins.__import__', side_effect=mock_import):
|
||||
gpu = detect_nvidia_gpu()
|
||||
|
||||
assert gpu is not None
|
||||
assert gpu.name == "NVIDIA GeForce RTX 3080"
|
||||
assert gpu.vram_gb == 10.0
|
||||
assert gpu.driver_version == "535.104.05"
|
||||
assert gpu.is_nvidia is True
|
||||
assert gpu.compute_capability == "8.6"
|
||||
assert gpu.device_count == 1
|
||||
|
||||
def test_detect_nvidia_gpu_no_gpu(self):
|
||||
"""Test detection when no NVIDIA GPU present."""
|
||||
mock_pynvml = Mock()
|
||||
mock_pynvml.nvmlInit = Mock()
|
||||
mock_pynvml.nvmlShutdown = Mock()
|
||||
mock_pynvml.nvmlDeviceGetCount = Mock(return_value=0)
|
||||
|
||||
def mock_import(name, *args, **kwargs):
|
||||
if name == 'pynvml':
|
||||
return mock_pynvml
|
||||
return __builtins__.__import__(name, *args, **kwargs)
|
||||
|
||||
with patch('builtins.__import__', side_effect=mock_import):
|
||||
gpu = detect_nvidia_gpu()
|
||||
|
||||
assert gpu is None
|
||||
|
||||
def test_detect_nvidia_gpu_import_error(self):
|
||||
"""Test detection when pynvml is not installed."""
|
||||
def mock_import(name, *args, **kwargs):
|
||||
if name == 'pynvml':
|
||||
raise ImportError("No module named 'pynvml'")
|
||||
return __builtins__.__import__(name, *args, **kwargs)
|
||||
|
||||
with patch('builtins.__import__', side_effect=mock_import):
|
||||
gpu = detect_nvidia_gpu()
|
||||
|
||||
assert gpu is None
|
||||
|
||||
|
||||
class TestGPULayerCalculation:
|
||||
"""Test GPU layer auto-configuration."""
|
||||
|
||||
def test_calculate_gpu_layers_apple_silicon(self):
|
||||
"""Test layer calculation for Apple Silicon."""
|
||||
gpu = GPUInfo(
|
||||
name="Apple Silicon GPU",
|
||||
vram_gb=32.0,
|
||||
is_apple_silicon=True
|
||||
)
|
||||
assert calculate_gpu_layers(gpu) == -1
|
||||
|
||||
def test_calculate_gpu_layers_nvidia(self):
|
||||
"""Test layer calculation for NVIDIA GPU."""
|
||||
gpu = GPUInfo(
|
||||
name="NVIDIA GeForce RTX 3080",
|
||||
vram_gb=10.0,
|
||||
is_nvidia=True,
|
||||
compute_capability="8.6"
|
||||
)
|
||||
assert calculate_gpu_layers(gpu) == -1
|
||||
|
||||
def test_calculate_gpu_layers_old_nvidia(self):
|
||||
"""Test layer calculation for old NVIDIA GPU."""
|
||||
gpu = GPUInfo(
|
||||
name="NVIDIA GeForce GTX 680",
|
||||
vram_gb=2.0,
|
||||
is_nvidia=True,
|
||||
compute_capability="3.0"
|
||||
)
|
||||
assert calculate_gpu_layers(gpu) == 0 # Too old
|
||||
|
||||
def test_calculate_gpu_layers_no_gpu(self):
|
||||
"""Test layer calculation with no GPU."""
|
||||
assert calculate_gpu_layers(None) == 0
|
||||
|
||||
def test_validate_gpu_layers_success(self):
|
||||
"""Test successful layer validation."""
|
||||
gpu = GPUInfo(
|
||||
name="NVIDIA GeForce RTX 3080",
|
||||
vram_gb=10.0,
|
||||
is_nvidia=True,
|
||||
compute_capability="8.6"
|
||||
)
|
||||
assert validate_gpu_layers(-1, gpu) == -1
|
||||
|
||||
def test_validate_gpu_layers_no_gpu_error(self):
|
||||
"""Test validation error when GPU requested but none available."""
|
||||
with pytest.raises(ValueError, match="no GPU detected"):
|
||||
validate_gpu_layers(-1, None)
|
||||
|
||||
def test_validate_gpu_layers_old_gpu_error(self):
|
||||
"""Test validation error for unsupported GPU."""
|
||||
gpu = GPUInfo(
|
||||
name="NVIDIA GeForce GTX 680",
|
||||
vram_gb=2.0,
|
||||
is_nvidia=True,
|
||||
compute_capability="3.0"
|
||||
)
|
||||
with pytest.raises(ValueError, match="Minimum required is 5.0"):
|
||||
validate_gpu_layers(-1, gpu)
|
||||
|
||||
|
||||
class TestAndroidDetection:
|
||||
"""Test Android platform detection."""
|
||||
|
||||
@patch.dict('os.environ', {'ANDROID_ROOT': '/system'}, clear=True)
|
||||
@patch('os.path.exists')
|
||||
def test_is_android_env_var(self, mock_exists):
|
||||
"""Test Android detection via environment variables."""
|
||||
mock_exists.return_value = False
|
||||
assert is_android() is True
|
||||
|
||||
@patch.dict('os.environ', {}, clear=True)
|
||||
@patch('os.path.exists')
|
||||
def test_is_android_paths(self, mock_exists):
|
||||
"""Test Android detection via filesystem paths."""
|
||||
def exists_side_effect(path):
|
||||
return path == "/system/build.prop"
|
||||
mock_exists.side_effect = exists_side_effect
|
||||
assert is_android() is True
|
||||
|
||||
@patch.dict('os.environ', {}, clear=True)
|
||||
@patch('os.path.exists')
|
||||
def test_is_not_android(self, mock_exists):
|
||||
"""Test non-Android system."""
|
||||
mock_exists.return_value = False
|
||||
assert is_android() is False
|
||||
@@ -0,0 +1,183 @@
|
||||
"""Integration test for tool execution in chat completions.
|
||||
|
||||
This test verifies that:
|
||||
1. Tools are properly parsed from model output
|
||||
2. Tools are executed and results fed back to model
|
||||
3. The loop continues generating until final response
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
||||
|
||||
from api.models import ChatMessage
|
||||
from api.chat_handlers import handle_chat_completion, _sanitize_tools
|
||||
from api.tool_parser import parse_tool_calls
|
||||
from api.formatting import format_messages_with_tools
|
||||
|
||||
|
||||
class MockSwarm:
|
||||
"""Mock swarm manager for testing."""
|
||||
|
||||
async def generate(self, prompt, max_tokens, temperature, use_consensus):
|
||||
"""Generate a mock response."""
|
||||
# Return different responses based on prompt content
|
||||
if "tool_result" in prompt.lower():
|
||||
# Final response after tool execution
|
||||
return MockResponse("Here's the result: The tool was executed successfully!")
|
||||
else:
|
||||
# First response with tool call
|
||||
return MockResponse("TOOL: bash\nARGUMENTS: {\"command\": \"echo test\"}")
|
||||
|
||||
|
||||
class MockResponse:
|
||||
"""Mock generation result."""
|
||||
|
||||
def __init__(self, text):
|
||||
self.selected_response = MockSelectedResponse(text)
|
||||
|
||||
|
||||
class MockSelectedResponse:
|
||||
"""Mock selected response."""
|
||||
|
||||
def __init__(self, text):
|
||||
self.text = text
|
||||
self.tokens_generated = 50
|
||||
self.tokens_per_second = 10.0
|
||||
|
||||
|
||||
class MockExecutor:
|
||||
"""Mock tool executor."""
|
||||
|
||||
async def execute_tool(self, tool_name, tool_args, working_dir=None):
|
||||
"""Execute a tool mock."""
|
||||
return f"Mock result from {tool_name} with args {tool_args}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_execution_loop():
|
||||
"""Test that tools are executed and loop continues."""
|
||||
print("Testing tool execution loop...")
|
||||
|
||||
# Create a mock request
|
||||
request = ChatMessage(
|
||||
role="user",
|
||||
content="Run echo test"
|
||||
)
|
||||
|
||||
# Wrap in request object
|
||||
from api.models import ChatCompletionRequest
|
||||
req = ChatCompletionRequest(
|
||||
model="test-model",
|
||||
messages=[request],
|
||||
tools=None,
|
||||
max_tokens=1024,
|
||||
temperature=0.7
|
||||
)
|
||||
|
||||
# Create mock swarm
|
||||
swarm = MockSwarm()
|
||||
|
||||
# We can't easily test the full handler without a real tool executor,
|
||||
# so let's test the key parts
|
||||
|
||||
# Test 1: Verify tool parsing works
|
||||
print(" Test 1: Tool parsing")
|
||||
tool_text = 'TOOL: bash\nARGUMENTS: {"command": "echo test"}'
|
||||
content, tool_calls = parse_tool_calls(tool_text)
|
||||
|
||||
assert tool_calls is not None, "Tool calls should be parsed"
|
||||
assert len(tool_calls) == 1, "Should parse one tool call"
|
||||
assert tool_calls[0]["function"]["name"] == "bash", "Tool name should be bash"
|
||||
assert "echo test" in tool_calls[0]["function"]["arguments"], "Command should be in arguments"
|
||||
print(" ✓ Tool parsing works correctly")
|
||||
|
||||
# Test 2: Verify tool instructions are loaded
|
||||
print(" Test 2: Tool instructions")
|
||||
instructions = format_messages_with_tools([request], None)
|
||||
assert len(instructions) > 0, "Instructions should be generated"
|
||||
assert "tool" in instructions.lower(), "Instructions should mention tools"
|
||||
print(" ✓ Tool instructions are loaded")
|
||||
|
||||
# Test 3: Verify multiple tool calls can be parsed
|
||||
print(" Test 3: Multiple tool calls")
|
||||
multi_tool = '''TOOL: bash
|
||||
ARGUMENTS: {"command": "ls"}
|
||||
|
||||
TOOL: write
|
||||
ARGUMENTS: {"filePath": "test.txt", "content": "hello"}'''
|
||||
content, tool_calls = parse_tool_calls(multi_tool)
|
||||
assert tool_calls is not None, "Multiple tools should be parsed"
|
||||
assert len(tool_calls) == 2, "Should parse two tool calls"
|
||||
assert tool_calls[0]["function"]["name"] == "bash", "First tool should be bash"
|
||||
assert tool_calls[1]["function"]["name"] == "write", "Second tool should be write"
|
||||
print(" ✓ Multiple tool calls parsed correctly")
|
||||
|
||||
# Test 4: Verify tool sanitization
|
||||
print(" Test 4: Tool sanitization")
|
||||
# Create mock tool with invalid 'description' in properties
|
||||
from api.models import Tool, FunctionDefinition
|
||||
mock_tool = Tool(
|
||||
type="function",
|
||||
function=FunctionDefinition(
|
||||
name="test_tool",
|
||||
description="Test tool",
|
||||
parameters={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"description": "Invalid field",
|
||||
"param1": {"type": "string"}
|
||||
},
|
||||
"required": ["description", "param1"]
|
||||
}
|
||||
)
|
||||
)
|
||||
sanitized = _sanitize_tools([mock_tool])
|
||||
assert len(sanitized) == 1, "Should return one tool"
|
||||
assert "description" not in sanitized[0].function.parameters.get("properties", {}), \
|
||||
"Should remove invalid 'description' from properties"
|
||||
print(" ✓ Tool sanitization removes invalid fields")
|
||||
|
||||
print("\n✅ All tool execution loop tests passed!")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_tool_parsing():
|
||||
"""Test that normal responses without tools work."""
|
||||
print("\nTesting response without tools...")
|
||||
|
||||
# Test normal response
|
||||
normal_text = "This is a normal response without any tool calls."
|
||||
content, tool_calls = parse_tool_calls(normal_text)
|
||||
|
||||
assert tool_calls is None, "No tool calls should be found"
|
||||
assert content == normal_text, "Content should be returned unchanged"
|
||||
print(" ✓ Normal responses pass through without modification")
|
||||
|
||||
print("\n✅ No-tool parsing test passed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
async def run_tests():
|
||||
try:
|
||||
await test_tool_execution_loop()
|
||||
await test_no_tool_parsing()
|
||||
print("\n" + "=" * 60)
|
||||
print("All integration tests passed!")
|
||||
print("=" * 60)
|
||||
except AssertionError as e:
|
||||
print(f"\n❌ Test failed: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
asyncio.run(run_tests())
|
||||
@@ -133,7 +133,7 @@ ls -la
|
||||
|
||||
def test_tool_instructions_content():
|
||||
"""Test that tool instructions contain required sections (REVIEW-2026-02-24 Blocker #4)."""
|
||||
from api.routes import _load_tool_instructions
|
||||
from api.formatting import _load_tool_instructions
|
||||
|
||||
# Load instructions from config file
|
||||
instructions = _load_tool_instructions()
|
||||
@@ -147,7 +147,7 @@ def test_tool_instructions_content():
|
||||
|
||||
def test_tool_instructions_token_count():
|
||||
"""Test that tool instructions are within token budget (REVIEW-2026-02-24 Blocker #1)."""
|
||||
from api.routes import _load_tool_instructions
|
||||
from api.formatting import _load_tool_instructions
|
||||
|
||||
# Load instructions from config file
|
||||
instructions = _load_tool_instructions()
|
||||
|
||||
@@ -0,0 +1,105 @@
|
||||
"""Test to verify tool execution is triggered when model generates tool calls."""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tool_execution_triggered():
|
||||
"""Verify that tool execution is properly triggered."""
|
||||
|
||||
from api.models import ChatMessage, ChatCompletionRequest
|
||||
from api.chat_handlers import handle_chat_completion
|
||||
from api.tool_parser import parse_tool_calls
|
||||
from tools.executor import ToolExecutor, set_tool_executor
|
||||
|
||||
print("=" * 60)
|
||||
print("Tool Execution Trigger Test")
|
||||
print("=" * 60)
|
||||
|
||||
# Create a mock swarm that generates a tool call
|
||||
class MockSwarm:
|
||||
async def generate(self, prompt, max_tokens, temperature, use_consensus):
|
||||
# First call: generate tool call
|
||||
if "user" in prompt and "echo hello" in prompt:
|
||||
return MockResult("TOOL: bash\nARGUMENTS: {\"command\": \"echo hello\"}")
|
||||
# Second call: after tool result, generate answer
|
||||
elif "tool" in prompt.lower():
|
||||
return MockResult("Output: hello\nThe command executed successfully!")
|
||||
else:
|
||||
return MockResult("I don't understand")
|
||||
|
||||
class MockResult:
|
||||
def __init__(self, text):
|
||||
self.selected_response = MockSelectedResponse(text)
|
||||
|
||||
class MockSelectedResponse:
|
||||
def __init__(self, text):
|
||||
self.text = text
|
||||
self.tokens_generated = 20
|
||||
self.tokens_per_second = 5.0
|
||||
|
||||
# Set up tool executor
|
||||
executor = ToolExecutor(tool_host_url=None)
|
||||
set_tool_executor(executor)
|
||||
|
||||
# Create request
|
||||
request = ChatCompletionRequest(
|
||||
model="test-model",
|
||||
messages=[ChatMessage(role="user", content="echo hello")],
|
||||
tools=None, # No explicit tools - should still parse from response
|
||||
max_tokens=1024,
|
||||
temperature=0.7
|
||||
)
|
||||
|
||||
print("\n1. Testing that tool calls are parsed...")
|
||||
model_response = "TOOL: bash\nARGUMENTS: {\"command\": \"echo hello\"}"
|
||||
content, tool_calls = parse_tool_calls(model_response)
|
||||
|
||||
assert tool_calls is not None, "Tool calls should be parsed from response"
|
||||
assert len(tool_calls) == 1, "Should have one tool call"
|
||||
print(f" ✓ Tool call parsed: {tool_calls[0]['function']['name']}")
|
||||
|
||||
print("\n2. Verifying tool executor is set...")
|
||||
from tools.executor import get_tool_executor
|
||||
current_executor = get_tool_executor()
|
||||
assert current_executor is not None, "Tool executor should be set"
|
||||
print(f" ✓ Tool executor configured: {current_executor.tool_host_url or 'local'}")
|
||||
|
||||
print("\n3. Testing tool execution...")
|
||||
# Try to execute the tool
|
||||
try:
|
||||
from api.routes import execute_tool_server_side
|
||||
result = await execute_tool_server_side(
|
||||
"bash",
|
||||
{"command": "echo hello"},
|
||||
working_dir=None
|
||||
)
|
||||
print(f" ✓ Tool executed successfully")
|
||||
print(f" ✓ Result: {result[:50]}..." if len(result) > 50 else f" ✓ Result: {result}")
|
||||
except Exception as e:
|
||||
print(f" ✗ Tool execution failed: {e}")
|
||||
raise
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("All tool execution trigger tests passed!")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
asyncio.run(test_tool_execution_triggered())
|
||||
except AssertionError as e:
|
||||
print(f"\n❌ Test failed: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||