commit 8cf1e167031528f8a5a338b6dd2ebcdad5ff8a51 Author: Kaloyan Nikolov Date: Mon Feb 23 16:46:31 2026 +0100 Initial commit: Local Swarm project structure and documentation diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8b00c87 --- /dev/null +++ b/.gitignore @@ -0,0 +1,153 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +Pipfile.lock + +# poetry +poetry.lock + +# pdm +.pdm.toml + +# PEP 582 +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +.idea/ + +# VS Code +.vscode/ + +# Model cache +models/ +*.gguf +*.mlx +.cache/ + +# Local swarm specific +config.local.yaml +*.pid +logs/ diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000..c56e178 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,574 @@ +# Local Swarm - Detailed Implementation Plan + +## Overview +A terminal-based tool that automatically configures and runs a swarm of small coding LLMs optimized for your hardware, exposing an OpenAI-compatible API for integration with opencode and other tools. 
+
+## Architecture
+
+```
+local_swarm/
+├── src/
+│   ├── __init__.py
+│   ├── hardware/
+│   │   ├── __init__.py
+│   │   ├── detector.py        # Platform-agnostic hardware detection
+│   │   ├── nvidia.py          # NVIDIA GPU detection (Windows/Linux)
+│   │   ├── apple_silicon.py   # Apple Silicon detection (macOS)
+│   │   └── memory.py          # RAM detection
+│   ├── models/
+│   │   ├── __init__.py
+│   │   ├── registry.py        # Model database with specs
+│   │   ├── selector.py        # Optimal model/quant selection logic
+│   │   └── downloader.py      # Download manager (HuggingFace)
+│   ├── backends/
+│   │   ├── __init__.py
+│   │   ├── base.py            # Backend interface
+│   │   ├── llamacpp.py        # llama.cpp backend
+│   │   └── mlx.py             # MLX backend (macOS)
+│   ├── swarm/
+│   │   ├── __init__.py
+│   │   ├── manager.py         # Instance lifecycle management
+│   │   ├── worker.py          # Individual LLM instance wrapper
+│   │   └── consensus.py       # Voting/consensus algorithm
+│   └── api/
+│       ├── __init__.py
+│       ├── server.py          # FastAPI/uvicorn server
+│       ├── routes.py          # OpenAI-compatible endpoints
+│       └── middleware.py      # Request handling
+├── tests/
+├── config/
+│   └── models.yaml            # Model configurations
+├── scripts/
+│   ├── install.bat            # Windows installer
+│   └── install.sh             # Unix installer
+├── main.py                    # CLI entry point
+├── requirements.txt
+├── requirements-macos.txt     # MLX-specific deps
+├── setup.py
+└── .gitignore
+```
+
+## Implementation Phases
+
+### Phase 1: Foundation (Week 1)
+
+#### 1.1 Hardware Detection Module
+**File**: `src/hardware/detector.py`
+
+**Requirements**:
+- Cross-platform OS detection (Windows, macOS, Linux)
+- CPU info (cores, architecture)
+- RAM detection (total, available)
+- GPU detection with VRAM
+
+**Platform-specific implementations**:
+- **Windows**: Use `pynvml` for NVIDIA, fall back to DXGI adapter queries for others
+- **macOS**: Use `psutil` for RAM, `sysctl` for CPU, Metal API for GPU
+- **Linux**: Use `pynvml` for NVIDIA, `rocm-smi` for AMD
+
+**Output structure**:
+```python
+class HardwareProfile:
+    os: str  # 'windows', 'darwin', 'linux'
+    cpu_cores: int
+    ram_gb: float
+    gpu: Optional[GPUInfo]
+    is_apple_silicon: bool
+```
+
+**Model selection rules**:
+- External GPU (NVIDIA/AMD): Use 100% of VRAM
+- Apple Silicon: Use 50% of unified RAM
+- CPU-only: Use 50% of system RAM
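+
+As a sketch, `detect_hardware()` in `detector.py` could assemble the profile like this (illustrative only: it leans on `psutil` and `pynvml` from requirements.txt, follows the output structure above, and leaves out the AMD/DXGI/Metal paths):
+
+```python
+import platform
+from dataclasses import dataclass
+from typing import Optional
+
+import psutil
+
+
+@dataclass
+class GPUInfo:
+    name: str
+    vram_gb: float
+
+
+@dataclass
+class HardwareProfile:
+    os: str
+    cpu_cores: int
+    ram_gb: float
+    gpu: Optional[GPUInfo]
+    is_apple_silicon: bool
+
+
+def detect_hardware() -> HardwareProfile:
+    os_name = platform.system().lower()  # 'windows', 'darwin', 'linux'
+    is_apple = os_name == "darwin" and platform.machine() == "arm64"
+
+    gpu = None
+    try:
+        import pynvml
+        pynvml.nvmlInit()
+        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
+        name = pynvml.nvmlDeviceGetName(handle)
+        if isinstance(name, bytes):  # older pynvml returns bytes
+            name = name.decode()
+        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
+        gpu = GPUInfo(name=name, vram_gb=mem.total / 1024**3)
+    except Exception:
+        pass  # no NVIDIA GPU / driver; Apple Silicon budgets from RAM instead
+
+    return HardwareProfile(
+        os=os_name,
+        cpu_cores=psutil.cpu_count(logical=False) or psutil.cpu_count() or 1,
+        ram_gb=psutil.virtual_memory().total / 1024**3,
+        gpu=gpu,
+        is_apple_silicon=is_apple,
+    )
+```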
+
+#### 1.2 Model Registry
+**File**: `src/models/registry.py`
+
+**Model database** (YAML format):
+```yaml
+models:
+  qwen2.5-coder:
+    name: "Qwen 2.5 Coder"
+    description: "Alibaba's code-focused model"
+    variants:
+      - size: 3b
+        base_vram_gb: 6.0  # Approximate VRAM for fp16
+        quantizations:
+          q4_k_m:
+            vram_gb: 1.8
+            quality: "good"
+          q5_k_m:
+            vram_gb: 2.2
+            quality: "better"
+          q6_k:
+            vram_gb: 2.6
+            quality: "best"
+      - size: 7b
+        base_vram_gb: 14.0
+        quantizations:
+          q4_k_m:
+            vram_gb: 4.5
+          q5_k_m:
+            vram_gb: 5.2
+          q6_k:
+            vram_gb: 6.0
+
+  codellama:
+    name: "CodeLlama"
+    # Similar structure...
+
+  deepseek-coder:
+    name: "DeepSeek Coder"
+    # Similar structure...
+```
+
+**Selection priority**:
+1. Qwen 2.5 Coder (best for small sizes)
+2. DeepSeek Coder (good alternative)
+3. CodeLlama (fallback)
+
+#### 1.3 Model Selector Logic
+**File**: `src/models/selector.py`
+
+**Algorithm**:
+```python
+def select_optimal_model(hardware: HardwareProfile) -> ModelConfig:
+    available_memory = get_available_memory(hardware)
+
+    # Try models in priority order
+    for model in PRIORITY_MODELS:
+        # Find largest size that fits
+        for variant in reversed(model.variants):
+            # Try highest-quality quantization that fits
+            for quant in reversed(variant.quantizations):
+                total_vram_needed = quant.vram_gb * MIN_INSTANCES
+                if total_vram_needed <= available_memory:
+                    # Calculate max instances
+                    max_instances = int(available_memory // quant.vram_gb)
+                    # Cap at reasonable limit (e.g., 8)
+                    instances = min(max_instances, 8)
+                    return ModelConfig(model, variant, quant, instances)
+
+    # Fallback to smallest model
+    return FALLBACK_CONFIG
+```
+
+**Minimum instances**: 2 (for consensus voting)
+**Maximum instances**: 8 (to avoid overhead)
+
+### Phase 2: Backend Integration (Week 2)
+
+#### 2.1 Base Backend Interface
+**File**: `src/backends/base.py`
+
+```python
+from abc import ABC, abstractmethod
+from typing import AsyncIterator
+
+class LLMBackend(ABC):
+    @abstractmethod
+    async def load_model(self, model_path: str, config: dict) -> bool:
+        pass
+
+    @abstractmethod
+    async def generate(self, prompt: str, **kwargs) -> str:
+        pass
+
+    @abstractmethod
+    async def generate_stream(self, prompt: str, **kwargs) -> AsyncIterator[str]:
+        pass
+
+    @abstractmethod
+    def get_memory_usage(self) -> float:
+        """Return current VRAM/RAM usage in GB"""
+        pass
+
+    @abstractmethod
+    def shutdown(self):
+        pass
+```
+
+#### 2.2 llama.cpp Backend
+**File**: `src/backends/llamacpp.py`
+
+**Implementation**:
+- Use `llama-cpp-python` library
+- Support GGUF model format
+- GPU acceleration via CUDA/Metal
+- Server mode with HTTP API
+
+**Key features**:
+- Model caching to avoid reload
+- Context window management
+- Batch processing support
+
+**Memory calculation**:
+```python
+def calculate_memory_usage(model_path: str) -> float:
+    # Parse GGUF metadata
+    # Return estimated VRAM usage
+    ...
+```
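+
+A minimal sketch of this backend against the Phase 2.1 interface (the `Llama` class and its arguments are from `llama-cpp-python`; wrapping the blocking calls with a thread executor is one possible approach, and the defaults here are illustrative):
+
+```python
+import asyncio
+from typing import AsyncIterator
+
+from llama_cpp import Llama  # pip install llama-cpp-python
+
+from .base import LLMBackend
+
+
+class LlamaCppBackend(LLMBackend):
+    def __init__(self):
+        self.llm = None
+
+    async def load_model(self, model_path: str, config: dict) -> bool:
+        # Llama() blocks while loading, so run it off the event loop
+        self.llm = await asyncio.to_thread(
+            Llama,
+            model_path=model_path,
+            n_ctx=config.get("n_ctx", 4096),
+            n_gpu_layers=config.get("n_gpu_layers", -1),  # -1 = offload all layers
+            verbose=False,
+        )
+        return True
+
+    async def generate(self, prompt: str, **kwargs) -> str:
+        result = await asyncio.to_thread(
+            self.llm.create_completion,
+            prompt,
+            max_tokens=kwargs.get("max_tokens", 512),
+            temperature=kwargs.get("temperature", 0.7),
+        )
+        return result["choices"][0]["text"]
+
+    async def generate_stream(self, prompt: str, **kwargs) -> AsyncIterator[str]:
+        # Yield tokens as they arrive; iteration itself blocks, so a
+        # production version would hand chunks over via a queue/thread
+        for chunk in self.llm.create_completion(prompt, stream=True):
+            yield chunk["choices"][0]["text"]
+
+    def get_memory_usage(self) -> float:
+        return 0.0  # TODO: estimate from GGUF metadata as described above
+
+    def shutdown(self):
+        self.llm = None
+```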
+
+#### 2.3 MLX Backend (macOS)
+**File**: `src/backends/mlx.py`
+
+**Implementation**:
+- Use `mlx-lm` library
+- Support MLX format models
+- Optimized for Apple Silicon
+
+**Key differences from llama.cpp**:
+- Native Metal performance
+- Simpler API
+- Unified memory model
+
+### Phase 3: Swarm Management (Week 3)
+
+#### 3.1 Worker Instance
+**File**: `src/swarm/worker.py`
+
+Each worker manages:
+- One LLM instance
+- Request queue
+- Health monitoring
+- Metrics collection
+
+```python
+class SwarmWorker:
+    def __init__(self, worker_id: int, backend: LLMBackend, config: dict):
+        self.worker_id = worker_id
+        self.backend = backend
+        self.is_healthy = True
+        self.request_count = 0
+        self.avg_latency = 0.0
+
+    async def process(self, request: GenerationRequest) -> GenerationResponse:
+        start = time.time()
+        response = await self.backend.generate(**request.params)
+        latency = time.time() - start
+        self._update_metrics(latency)
+        return GenerationResponse(response, latency, self.worker_id)
+```
+
+#### 3.2 Swarm Manager
+**File**: `src/swarm/manager.py`
+
+Responsibilities:
+- Spawn N workers based on hardware
+- Distribute requests to all workers
+- Collect responses
+- Handle worker failures
+
+```python
+class SwarmManager:
+    def __init__(self, config: ModelConfig):
+        self.workers: List[SwarmWorker] = []
+        self.config = config
+
+    async def initialize(self):
+        # Download model if needed
+        model_path = await self._ensure_model()
+
+        # Spawn workers
+        for i in range(self.config.instances):
+            backend = self._create_backend()
+            await backend.load_model(model_path, self.config.backend_params)
+            worker = SwarmWorker(i, backend, self.config)
+            self.workers.append(worker)
+
+    async def generate_all(self, prompt: str, **kwargs) -> List[GenerationResponse]:
+        # Send to all workers in parallel
+        tasks = [w.process(request) for w in self.workers]
+        return await asyncio.gather(*tasks)
+```
+
+#### 3.3 Consensus Algorithm
+**File**: `src/swarm/consensus.py`
+
+**Voting strategies**:
+
+1. **Similarity voting** (default):
+   - Embed all responses
+   - Group by semantic similarity
+   - Return largest group
+
+2. **Quality scoring**:
+   - Score each response on:
+     - Completeness (does it answer the question?)
+     - Code quality (syntax, structure)
+     - Length appropriateness
+   - Return highest score
+
+3. **Latency-weighted**:
+   - Prefer faster responses (lower memory pressure)
+
+**Implementation**:
+```python
+class ConsensusEngine:
+    def __init__(self, strategy: str = "similarity"):
+        self.strategy = strategy
+        self.embedding_model = None  # Lazy load
+
+    async def select_best(self, responses: List[GenerationResponse]) -> str:
+        if len(responses) == 1:
+            return responses[0].text
+
+        if self.strategy == "similarity":
+            return await self._similarity_vote(responses)
+        elif self.strategy == "quality":
+            return await self._quality_score(responses)
+        else:
+            return self._fastest_response(responses)
+
+    async def _similarity_vote(self, responses: List[GenerationResponse]) -> str:
+        # Use sentence-transformers for embeddings
+        # Group by cosine similarity > 0.85
+        # Return the most representative response from the largest group
+        ...
+```
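+
+For illustration, `_similarity_vote` could be fleshed out roughly like this (shown standalone; `sentence-transformers` is already in requirements.txt, and the 0.85 threshold with greedy grouping follows the notes above — the model choice and helper name are assumptions):
+
+```python
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+_model = SentenceTransformer("all-MiniLM-L6-v2")  # small, fast embedder
+
+
+def similarity_vote(texts: list[str], threshold: float = 0.85) -> str:
+    # Embed and L2-normalise so a dot product equals cosine similarity
+    emb = _model.encode(texts)
+    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
+    sims = emb @ emb.T
+
+    # Greedy grouping: each response joins the first group whose seed it matches
+    groups: list[list[int]] = []
+    for i in range(len(texts)):
+        for g in groups:
+            if sims[i, g[0]] > threshold:
+                g.append(i)
+                break
+        else:
+            groups.append([i])
+
+    largest = max(groups, key=len)
+    # Representative: the member most similar to the rest of its group
+    best = max(largest, key=lambda i: sims[i, largest].sum())
+    return texts[best]
+```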
+
+### Phase 4: API Server (Week 4)
+
+#### 4.1 OpenAI-Compatible Endpoints
+**File**: `src/api/routes.py`
+
+Required endpoints:
+- `GET /v1/models` - List available models
+- `POST /v1/chat/completions` - Chat completion
+- `POST /v1/completions` - Text completion (optional)
+- `GET /health` - Health check
+- `GET /metrics` - Prometheus metrics (optional)
+
+**Chat completions endpoint**:
+```python
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    # Extract messages
+    messages = request.messages
+    prompt = format_messages(messages)
+
+    # Get all responses from swarm
+    responses = await swarm_manager.generate_all(prompt, **request.params)
+
+    # Run consensus
+    best_response = await consensus_engine.select_best(responses)
+
+    # Format as OpenAI response
+    return {
+        "id": f"chatcmpl-{uuid4()}",
+        "object": "chat.completion",
+        "created": int(time.time()),
+        "model": request.model,
+        "choices": [{
+            "index": 0,
+            "message": {"role": "assistant", "content": best_response},
+            "finish_reason": "stop"
+        }],
+        "usage": calculate_usage(prompt, best_response)
+    }
+```
+
+#### 4.2 Streaming Support
+**File**: `src/api/routes.py`
+
+For streaming, use the fastest worker instead of consensus:
+```python
+if request.stream:
+    # Pick the worker with the lowest average latency
+    worker = swarm_manager.get_fastest_worker()
+    return StreamingResponse(
+        worker.backend.generate_stream(prompt),
+        media_type="text/event-stream"
+    )
+```
+
+### Phase 5: CLI & Distribution (Week 5)
+
+#### 5.1 CLI Interface
+**File**: `main.py`
+
+Commands:
+```bash
+# Start the swarm (auto-detect hardware)
+python -m local_swarm
+
+# Start with specific model
+python -m local_swarm --model qwen2.5-coder:3b:q4
+
+# Start with specific port
+python -m local_swarm --port 8080
+
+# Override instance count
+python -m local_swarm --instances 4
+
+# Show hardware detection
+python -m local_swarm --detect
+
+# Download models only
+python -m local_swarm --download-only
+```
+
+#### 5.2 Configuration File
+**File**: `config.yaml`
+
+```yaml
+server:
+  host: "127.0.0.1"
+  port: 8000
+
+swarm:
+  consensus_strategy: "similarity"  # similarity, quality, fastest
+  min_instances: 2
+  max_instances: 8
+  timeout: 60
+
+models:
+  cache_dir: "~/.local_swarm/models"
+  preferred_models:
+    - qwen2.5-coder
+    - deepseek-coder
+    - codellama
+
+hardware:
+  gpu_memory_fraction: 1.0  # Use 100% of GPU VRAM
+  ram_fraction: 0.5         # Use 50% of system RAM for CPU/Apple Silicon
+```
+
+#### 5.3 Installation Scripts
+
+**Windows** (`scripts/install.bat`):
+```batch
+@echo off
+echo Installing Local Swarm...
+python -m pip install --upgrade pip
+pip install -r requirements.txt
+
+:: Check for CUDA
+nvidia-smi >nul 2>&1
+if %errorlevel% == 0 (
+    echo CUDA detected, installing GPU support...
+    pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+) else (
+    echo No CUDA detected, using CPU backend...
+    pip install llama-cpp-python
+)
+
+echo Installation complete!
+echo Run: python -m local_swarm
+```
+
+**macOS/Linux** (`scripts/install.sh`):
+```bash
+#!/bin/bash
+set -e
+
+echo "Installing Local Swarm..."
+pip install --upgrade pip
+
+# Detect platform
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    echo "macOS detected..."
+    pip install -r requirements.txt
+    pip install -r requirements-macos.txt
+elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
+    echo "Linux detected..."
+    pip install -r requirements.txt
+    if command -v nvidia-smi &> /dev/null; then
+        echo "CUDA detected, installing GPU support..."
+        pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+    else
+        pip install llama-cpp-python
+    fi
+fi
+
+echo "Installation complete!"
+echo "Run: python -m local_swarm"
+```
+
+### Phase 6: Testing & Polish (Week 6)
+
+#### 6.1 Test Coverage
+
+**Unit tests**:
+- Hardware detection mocking
+- Model selection logic
+- Consensus algorithm
+- API endpoint validation
+
+**Integration tests**:
+- End-to-end inference
+- Multi-worker coordination
+- Error handling
+
+**Platform tests**:
+- Windows with NVIDIA
+- macOS with M1/M2/M3
+- Linux with CUDA
+- CPU-only fallback
+
+#### 6.2 Performance Optimization
+
+- **Model warmup**: Pre-load models on startup
+- **Request batching**: Group similar requests
+- **Worker pooling**: Reuse workers instead of respawning
+- **Memory monitoring**: Auto-shutdown if OOM
+
+#### 6.3 Documentation
+
+- API documentation (OpenAPI spec)
+- Configuration guide
+- Troubleshooting
+- Performance tuning tips
+
+## Technical Decisions
+
+### Why llama.cpp?
+- Best cross-platform support
+- Mature quantization formats (GGUF)
+- Active community
+- Good performance/quality tradeoff
+
+### Why MLX for macOS?
+- Native Apple Silicon optimization
+- Simpler than llama.cpp on macOS
+- Better unified memory handling
+
+### Why consensus voting?
+- Improves response quality vs. a single model
+- Uses available hardware efficiently
+- Can detect model hallucinations
+
+### Memory Model
+
+**External GPU (NVIDIA/AMD)**:
+- Use all of VRAM, minus a ~10% buffer for OS/drivers
+- Each instance gets an equal share
+
+**Apple Silicon**:
+- Use 50% of unified RAM
+- Avoid system swap
+- Monitor memory pressure
+
+**CPU-only**:
+- Use 50% of system RAM
+- Dependent on available memory
+- Slower but functional
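+
+Expressed as code, the budget rules above amount to something like this sketch (the 0.9 factor encodes the ~10% GPU buffer; `HardwareProfile` is the Phase 1.1 structure, and the helper names are illustrative):
+
+```python
+def memory_budget(hw: HardwareProfile) -> float:
+    """GB of memory available for model instances."""
+    if hw.gpu is not None:
+        return hw.gpu.vram_gb * 0.9  # all of VRAM minus ~10% buffer for OS/drivers
+    return hw.ram_gb * 0.5           # Apple Silicon and CPU-only: half of system RAM
+
+
+def instance_count(budget_gb: float, per_instance_gb: float) -> int:
+    n = int(budget_gb // per_instance_gb)
+    return min(n, 8)  # caller rejects configs where n < 2
+
+
+# Example: RTX 4060 Ti (16 GB VRAM) -> budget 14.4 GB
+# -> 3 instances of the 7B Q4_K_M quant at 4.5 GB each
+```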
+
+## Future Enhancements
+
+1. **Multi-GPU support**: Distribute across multiple GPUs
+2. **Dynamic scaling**: Add/remove workers based on load
+3. **Model mixing**: Different models in same swarm
+4. **Fine-tuning**: Local fine-tuning on user data
+5. **Web UI**: Browser-based configuration
+6. **Docker support**: Containerized deployment
+7. **Cloud inference**: Fallback to cloud APIs
+
+## Success Metrics
+
+- **Startup time**: < 30 seconds from cold start
+- **First inference**: < 10 seconds after startup
+- **Concurrent requests**: Support 2-8 parallel inferences
+- **Consensus accuracy**: > 80% agreement on code tasks
+- **Memory efficiency**: Use > 80% of available memory
+- **Cross-platform**: Works on Windows/macOS/Linux without code changes
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..dcf3fc9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,352 @@
+# Local Swarm
+
+Automatically configure and run a swarm of small coding LLMs optimized for your hardware. Provides an OpenAI-compatible API for seamless integration with opencode and other tools.
+
+## Features
+
+- **Hardware Auto-Detection**: Automatically detects your GPU (NVIDIA), Apple Silicon, or CPU and selects optimal settings
+- **Smart Model Selection**: Chooses the best model, quantization, and instance count based on available VRAM/RAM
+- **Swarm Consensus**: Multiple LLM instances vote on the best response for higher quality outputs
+- **OpenAI-Compatible API**: Drop-in replacement for OpenAI API at `http://localhost:8000/v1`
+- **Cross-Platform**: Works on Windows, macOS, and Linux with automatic backend selection
+
+## Quick Start
+
+### Installation
+
+#### Windows (PowerShell)
+```powershell
+# Clone the repository
+git clone https://github.com/yourusername/local_swarm.git
+cd local_swarm
+
+# Run installer
+.\scripts\install.bat
+```
+
+#### macOS/Linux
+```bash
+# Clone the repository
+git clone https://github.com/yourusername/local_swarm.git
+cd local_swarm
+
+# Run installer
+chmod +x scripts/install.sh
+./scripts/install.sh
+```
+
+### Usage
+
+#### Start the Swarm
+```bash
+# Auto-detect hardware and start
+python -m local_swarm
+
+# Or use the CLI
+python main.py
+```
+
+On first run, the tool will:
+1. Scan your hardware (GPU, RAM, CPU)
+2. Select the optimal model and quantization
+3. Download the model (one-time)
+4. Start multiple instances based on available memory
+5. Expose the API at `http://localhost:8000`
+
+Example startup output:
+```
+🔍 Detecting hardware...
+   OS: Windows 11
+   GPU: NVIDIA GeForce RTX 4060 Ti (16 GB VRAM)
+   CPU: 16 cores
+   RAM: 32 GB
+
+📊 Optimal configuration:
+   Model: Qwen 2.5 Coder 7B
+   Quantization: Q4_K_M (4.5 GB per instance)
+   Instances: 3 (using 13.5 GB VRAM)
+
+⬇️ Downloading model...
+   Progress: 100% ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 4.5/4.5 GB
+
+🚀 Starting swarm...
+   Worker 1: Ready (GPU:0)
+   Worker 2: Ready (GPU:0)
+   Worker 3: Ready (GPU:0)
+
+✅ Local Swarm is running!
+   API: http://localhost:8000/v1
+   Models: http://localhost:8000/v1/models
+   Health: http://localhost:8000/health
+
+💡 Configure opencode to use:
+   base_url: http://localhost:8000/v1
+   api_key: any (not used)
+```
+
+#### Configure opencode
+
+Add to your opencode configuration:
+
+```json
+{
+  "model": {
+    "provider": "openai",
+    "base_url": "http://localhost:8000/v1",
+    "api_key": "not-needed",
+    "model": "local-swarm"
+  }
+}
+```
+
+## Configuration
+
+Create a `config.yaml` file for customization:
+
+```yaml
+server:
+  host: "127.0.0.1"
+  port: 8000
+
+swarm:
+  consensus_strategy: "similarity"  # similarity, quality, fastest
+  min_instances: 2
+  max_instances: 8
+
+hardware:
+  gpu_memory_fraction: 1.0  # Use 100% of GPU VRAM
+  ram_fraction: 0.5         # Use 50% of system RAM for CPU/Apple Silicon
+
+models:
+  cache_dir: "~/.local_swarm/models"
+```
+
+## CLI Options
+
+```bash
+# Show hardware detection without starting
+python -m local_swarm --detect
+
+# Use specific model
+python -m local_swarm --model qwen2.5-coder:3b:q4
+
+# Use specific port
+python -m local_swarm --port 8080
+
+# Force number of instances
+python -m local_swarm --instances 4
+
+# Download models only (no server)
+python -m local_swarm --download-only
+
+# Show help
+python -m local_swarm --help
+```
+
+## How It Works
+
+### Hardware Detection
+
+The tool automatically detects your system:
+- **Windows**: NVIDIA GPUs via NVML, DXGI fallback for others
+- **macOS**: Apple Silicon via Metal, unified memory model
+- **Linux**: NVIDIA (NVML), AMD (ROCm)
+
+### Model Selection
+
+Based on available memory:
+1. **External GPU**: Use 100% of VRAM minus OS overhead
+2. **Apple Silicon**: Use 50% of unified RAM
+3. **CPU-only**: Use 50% of system RAM
+
+The algorithm selects:
+- Largest model size that fits
+- Highest quantization quality possible
+- Maximum instances (2-8) based on memory
+
+Example configurations:
+
+| Hardware | Model | Quant | Instances | Memory Used |
+|----------|-------|-------|-----------|-------------|
+| RTX 4060 Ti 16GB | Qwen 2.5 7B | Q4_K_M | 3 | ~13.5 GB |
+| RTX 4060 Ti 8GB | Qwen 2.5 3B | Q6_K | 3 | ~7.8 GB |
+| M3 Pro 36GB | Qwen 2.5 7B | Q4_K_M | 4 | ~18 GB |
+| M1 8GB | Qwen 2.5 3B | Q4_K_M | 2 | ~3.6 GB |
+| CPU 32GB | Qwen 2.5 3B | Q4_K_M | 8 | ~14.4 GB |
+
+### Swarm Consensus
+
+For each request, the swarm:
+1. Sends the prompt to all running instances
+2. Collects responses in parallel
+3. Runs a consensus algorithm:
+   - **Similarity**: Groups responses by semantic similarity, returns the largest group
+   - **Quality**: Scores responses on completeness and code quality
+   - **Fastest**: Returns the quickest response
+4. Returns the winning response via the OpenAI-compatible API
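+
+Because the API is OpenAI-compatible, a few lines of `requests` are enough to smoke-test this loop (the `local-swarm` model name matches the configuration example above):
+
+```python
+import requests
+
+resp = requests.post(
+    "http://localhost:8000/v1/chat/completions",
+    json={
+        "model": "local-swarm",
+        "messages": [
+            {"role": "user", "content": "Write a Python function to sort a list"}
+        ],
+    },
+)
+print(resp.json()["choices"][0]["message"]["content"])
+```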
+
+## API Endpoints
+
+### GET /v1/models
+List available models
+
+### POST /v1/chat/completions
+Chat completion with consensus
+
+**Request**:
+```json
+{
+  "model": "local-swarm",
+  "messages": [
+    {"role": "user", "content": "Write a Python function to sort a list"}
+  ]
+}
+```
+
+**Response**:
+```json
+{
+  "id": "chatcmpl-abc123",
+  "object": "chat.completion",
+  "created": 1234567890,
+  "model": "local-swarm",
+  "choices": [{
+    "index": 0,
+    "message": {
+      "role": "assistant",
+      "content": "def sort_list(lst):\n    return sorted(lst)"
+    },
+    "finish_reason": "stop"
+  }]
+}
+```
+
+### GET /health
+Health check
+
+### GET /metrics
+Prometheus metrics (optional)
+
+## Supported Models
+
+Currently supported models (auto-selected based on hardware):
+
+- **Qwen 2.5 Coder** (3B, 7B, 14B) - Recommended for coding tasks
+- **DeepSeek Coder** (1.3B, 6.7B, 33B) - Good alternative
+- **CodeLlama** (7B, 13B, 34B) - Meta's code model
+
+All models support GGUF quantization:
+- Q4_K_M - Good quality, smallest size (recommended)
+- Q5_K_M - Better quality
+- Q6_K - Best quality
+
+## Troubleshooting
+
+### Out of Memory
+If you get OOM errors:
+```bash
+# Reduce instances
+python -m local_swarm --instances 2
+
+# Or use a smaller model
+python -m local_swarm --model qwen2.5-coder:3b:q4
+```
+
+### Slow Performance
+- Check GPU utilization with `nvidia-smi` (NVIDIA) or Activity Monitor (macOS)
+- Ensure the model is cached (the first run downloads to `~/.local_swarm/models`)
+- Try reducing instances to avoid contention
+
+### Windows: CUDA not detected
+Make sure NVIDIA drivers are installed:
+```powershell
+nvidia-smi
+```
+If this fails, reinstall the drivers from nvidia.com.
+
+### macOS: MLX not found
+```bash
+pip install mlx-lm
+```
+
+## Requirements
+
+- Python 3.9+
+- 4GB+ RAM (8GB+ recommended)
+- Optional: NVIDIA GPU with 4GB+ VRAM
+- Optional: Apple Silicon Mac
+
+## Development
+
+```bash
+# Install dev dependencies
+pip install -r requirements-dev.txt
+
+# Run tests
+pytest
+
+# Run specific platform tests
+pytest tests/test_hardware.py -v
+
+# Format code
+black src/
+ruff check src/
+```
+
+## Architecture
+
+```
+┌─────────────────────────────────────┐
+│         OpenAI API Client           │
+│         (opencode, etc.)            │
+└─────────────┬───────────────────────┘
+              │ HTTP
+              ▼
+┌─────────────────────────────────────┐
+│       Local Swarm API Server        │
+│      (FastAPI / localhost:8000)     │
+└─────────────┬───────────────────────┘
+              │
+              ▼
+┌─────────────────────────────────────┐
+│           Swarm Manager             │
+│   ┌─────────┐  ┌─────────┐          │
+│   │ Worker 1│  │ Worker 2│  ...     │
+│   │(LLM #1) │  │(LLM #2) │          │
+│   └────┬────┘  └────┬────┘          │
+│        │            │               │
+│        └─────┬──────┘               │
+│              ▼                      │
+│       Consensus Engine              │
+└─────────────────────────────────────┘
+              │
+              ▼
+┌─────────────────────────────────────┐
+│    Backend (llama.cpp / MLX)        │
+│   ┌─────────────────────┐           │
+│   │   GGUF/MLX Model    │           │
+│   │  (Qwen/CodeLlama)   │           │
+│   └─────────────────────┘           │
+└─────────────────────────────────────┘
+              │
+              ▼
+┌─────────────────────────────────────┐
+│  Hardware (GPU/CPU/Apple Silicon)   │
+└─────────────────────────────────────┘
+```
+
+## License
+
+MIT License - See LICENSE file
+
+## Contributing
+
+Contributions welcome! Please read CONTRIBUTING.md first.
+ +## Acknowledgments + +- [llama.cpp](https://github.com/ggerganov/llama.cpp) - Inference engine +- [MLX](https://github.com/ml-explore/mlx) - Apple Silicon backend +- [Qwen](https://github.com/QwenLM/Qwen) - Model family +- [HuggingFace](https://huggingface.co) - Model hosting diff --git a/main.py b/main.py new file mode 100644 index 0000000..ca5d3a4 --- /dev/null +++ b/main.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +""" +Local Swarm - Automatically configure and run a swarm of small coding LLMs +""" + +import argparse +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent / "src")) + +from rich.console import Console +from rich.panel import Panel + +console = Console() + + +def main(): + parser = argparse.ArgumentParser( + description="Local Swarm - AI-powered coding LLM swarm", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python main.py # Auto-detect and start + python main.py --detect # Show hardware detection only + python main.py --model qwen:3b:q4 # Use specific model + python main.py --port 8080 # Use custom port + python main.py --instances 4 # Force 4 instances + """ + ) + + parser.add_argument( + "--detect", + action="store_true", + help="Show hardware detection and exit" + ) + parser.add_argument( + "--model", + type=str, + help="Model to use (format: name:size:quant, e.g., qwen:3b:q4)" + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="Port to run the API server on (default: 8000)" + ) + parser.add_argument( + "--instances", + type=int, + help="Force number of instances (overrides auto-calculation)" + ) + parser.add_argument( + "--download-only", + action="store_true", + help="Download models only, don't start server" + ) + parser.add_argument( + "--config", + type=str, + default="config.yaml", + help="Path to config file" + ) + parser.add_argument( + "--version", + action="version", + version="%(prog)s 0.1.0" + ) + + args = parser.parse_args() + + # Show welcome + console.print(Panel.fit( + "[bold blue]Local Swarm[/bold blue] - AI-powered coding LLM swarm\n" + "Automatically configures optimal LLM setup for your hardware", + title="Welcome", + border_style="blue" + )) + + if args.detect: + console.print("[yellow]Hardware detection mode - not yet implemented[/yellow]") + console.print("Run without --detect to start the swarm (once implemented)") + return + + console.print("[green]Starting Local Swarm...[/green]") + console.print("[dim]Note: This is a placeholder. Implementation in progress.[/dim]") + console.print() + console.print("[bold]Next steps:[/bold]") + console.print("1. Check PLAN.md for implementation details") + console.print("2. Start implementing src/hardware/detector.py") + console.print("3. 
Continue with other modules")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements-macos.txt b/requirements-macos.txt
new file mode 100644
index 0000000..a22811a
--- /dev/null
+++ b/requirements-macos.txt
@@ -0,0 +1,3 @@
+# macOS specific dependencies
+mlx>=0.15.0
+mlx-lm>=0.8.0
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3acf8e6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,27 @@
+# Core dependencies
+pydantic>=2.0.0
+pyyaml>=6.0
+requests>=2.31.0
+tqdm>=4.65.0
+psutil>=5.9.0
+
+# API server
+fastapi>=0.104.0
+uvicorn[standard]>=0.24.0
+
+# Hardware detection
+pynvml>=11.5.0
+
+# ML/Embeddings (for consensus)
+sentence-transformers>=2.2.0
+numpy>=1.24.0
+
+# llama.cpp (CPU version, GPU version installed via scripts)
+llama-cpp-python>=0.2.0
+
+# Async (asyncio itself is in the standard library)
+aiohttp>=3.9.0
+
+# CLI
+click>=8.1.0
+rich>=13.0.0
diff --git a/scripts/install.bat b/scripts/install.bat
new file mode 100644
index 0000000..2ea39f2
--- /dev/null
+++ b/scripts/install.bat
@@ -0,0 +1,55 @@
+@echo off
+echo ==========================================
+echo   Local Swarm - Windows Installer
+echo ==========================================
+echo.
+
+REM Check Python
+python --version >nul 2>&1
+if errorlevel 1 (
+    echo [ERROR] Python is not installed or not in PATH
+    echo Please install Python 3.9+ from https://python.org
+    exit /b 1
+)
+
+echo [1/4] Checking Python version...
+for /f "tokens=2" %%a in ('python --version') do set PYTHON_VERSION=%%a
+echo     Found Python %PYTHON_VERSION%
+
+echo.
+echo [2/4] Upgrading pip...
+python -m pip install --upgrade pip
+
+echo.
+echo [3/4] Installing base dependencies...
+pip install -r requirements.txt
+
+REM Check for CUDA
+nvidia-smi >nul 2>&1
+if %errorlevel% == 0 (
+    echo.
+    echo [4/4] CUDA detected! Installing GPU-accelerated llama.cpp...
+    pip uninstall -y llama-cpp-python
+    pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+    echo     GPU support enabled!
+) else (
+    echo.
+    echo [4/4] No CUDA detected, using CPU backend...
+    echo     CPU-only mode ^(slower but works on any hardware^)
+)
+
+echo.
+echo ==========================================
+echo   Installation Complete!
+echo ==========================================
+echo.
+echo To start Local Swarm:
+echo   python main.py
+echo.
+echo To check hardware detection:
+echo   python main.py --detect
+echo.
+echo For more options:
+echo   python main.py --help
+echo.
+pause
diff --git a/scripts/install.sh b/scripts/install.sh
new file mode 100755
index 0000000..2b14aae
--- /dev/null
+++ b/scripts/install.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+set -e
+
+echo "=========================================="
+echo "  Local Swarm - Installer"
+echo "=========================================="
+echo
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Check Python
+if ! command -v python3 &> /dev/null; then
+    echo -e "${RED}[ERROR] Python 3 is not installed${NC}"
+    echo "Please install Python 3.9+ and try again"
+    exit 1
+fi
+
+echo "[1/4] Checking Python version..."
+PYTHON_VERSION=$(python3 --version | cut -d' ' -f2)
+echo "  Found Python $PYTHON_VERSION"
+
+echo
+echo "[2/4] Upgrading pip..."
+python3 -m pip install --upgrade pip
+
+echo
+echo "[3/4] Installing base dependencies..."
+pip3 install -r requirements.txt
+
+# Detect platform and install appropriate backend
+echo
+echo "[4/4] Detecting hardware and installing backend..."
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    # macOS
+    echo "  Platform: macOS"
+
+    # Check for Apple Silicon
+    if [[ $(uname -m) == "arm64" ]]; then
+        echo "  Hardware: Apple Silicon detected!"
+        echo "  Installing MLX backend..."
+        pip3 install -r requirements-macos.txt
+        echo -e "  ${GREEN}MLX backend installed!${NC}"
+    else
+        echo "  Hardware: Intel Mac"
+        echo "  Installing llama.cpp (CPU)..."
+        pip3 install llama-cpp-python
+        echo -e "  ${GREEN}llama.cpp installed (CPU mode)${NC}"
+    fi
+
+elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
+    # Linux
+    echo "  Platform: Linux"
+
+    # Check for NVIDIA GPU
+    if command -v nvidia-smi &> /dev/null; then
+        echo "  Hardware: NVIDIA GPU detected!"
+        echo "  Installing CUDA-enabled llama.cpp..."
+        pip3 uninstall -y llama-cpp-python 2>/dev/null || true
+        pip3 install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+        echo -e "  ${GREEN}GPU support enabled!${NC}"
+    else
+        echo "  Hardware: No NVIDIA GPU detected"
+        echo "  Installing llama.cpp (CPU)..."
+        pip3 install llama-cpp-python
+        echo -e "  ${GREEN}CPU backend installed${NC}"
+    fi
+
+    # Check for AMD GPU (ROCm)
+    if command -v rocm-smi &> /dev/null; then
+        echo -e "${YELLOW}[WARNING] AMD GPU detected but ROCm support is experimental${NC}"
+        echo "  Using CPU backend for now"
+    fi
+
+else
+    echo -e "${YELLOW}[WARNING] Unknown platform: $OSTYPE${NC}"
+    echo "  Installing generic CPU backend..."
+    pip3 install llama-cpp-python
+fi
+
+echo
+echo "=========================================="
+echo "  Installation Complete!"
+echo "=========================================="
+echo
+echo "To start Local Swarm:"
+echo "  python3 main.py"
+echo
+echo "To check hardware detection:"
+echo "  python3 main.py --detect"
+echo
+echo "For more options:"
+echo "  python3 main.py --help"
+echo
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..a746274
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,48 @@
+from setuptools import setup, find_packages
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+with open("requirements.txt", "r", encoding="utf-8") as fh:
+    requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]
+
+setup(
+    name="local-swarm",
+    version="0.1.0",
+    author="Local Swarm Contributors",
+    description="Automatically configure and run a swarm of small coding LLMs",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/yourusername/local_swarm",
+    packages=find_packages(where="src"),
+    package_dir={"": "src"},
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.9",
+    install_requires=requirements,
+    extras_require={
+        "macos": ["mlx>=0.15.0", "mlx-lm>=0.8.0"],
+        "dev": [
+            "pytest>=7.4.0",
+            "pytest-asyncio>=0.21.0",
+            "black>=23.0.0",
+            "ruff>=0.1.0",
+            "mypy>=1.6.0",
+        ],
+    },
+    entry_points={
+        "console_scripts": [
+            "local-swarm=main:main",
+        ],
+    },
+)
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/api/__init__.py b/src/api/__init__.py
new file mode 100644
index 
0000000..e69de29 diff --git a/src/backends/__init__.py b/src/backends/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/hardware/__init__.py b/src/hardware/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/swarm/__init__.py b/src/swarm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29