Initial commit: Local Swarm project structure and documentation
This commit is contained in:
+153
@@ -0,0 +1,153 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
.idea/
|
||||||
|
|
||||||
|
# VS Code
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
# Model cache
|
||||||
|
/models/
|
||||||
|
*.gguf
|
||||||
|
*.mlx
|
||||||
|
.cache/
|
||||||
|
|
||||||
|
# Local swarm specific
|
||||||
|
config.local.yaml
|
||||||
|
*.pid
|
||||||
|
logs/
|
||||||
@@ -0,0 +1,574 @@
|
|||||||
|
# Local Swarm - Detailed Implementation Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
A terminal-based tool that automatically configures and runs a swarm of small coding LLMs optimized for your hardware, exposing an OpenAI-compatible API for integration with opencode and other tools.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
local_swarm/
|
||||||
|
├── src/
|
||||||
|
│ ├── __init__.py
|
||||||
|
│ ├── hardware/
|
||||||
|
│ │ ├── __init__.py
|
||||||
|
│ │ ├── detector.py # Platform-agnostic hardware detection
|
||||||
|
│ │ ├── nvidia.py # NVIDIA GPU detection (Windows/Linux)
|
||||||
|
│ │ ├── apple_silicon.py # Apple Silicon detection (macOS)
|
||||||
|
│ │ └── memory.py # RAM detection
|
||||||
|
│ ├── models/
|
||||||
|
│ │ ├── __init__.py
|
||||||
|
│ │ ├── registry.py # Model database with specs
|
||||||
|
│ │ ├── selector.py # Optimal model/quant selection logic
|
||||||
|
│ │ └── downloader.py # Download manager (HuggingFace)
|
||||||
|
│ ├── backends/
|
||||||
|
│ │ ├── __init__.py
|
||||||
|
│ │ ├── base.py # Backend interface
|
||||||
|
│ │ ├── llamacpp.py # llama.cpp backend
|
||||||
|
│ │ └── mlx.py # MLX backend (macOS)
|
||||||
|
│ ├── swarm/
|
||||||
|
│ │ ├── __init__.py
|
||||||
|
│ │ ├── manager.py # Instance lifecycle management
|
||||||
|
│ │ ├── worker.py # Individual LLM instance wrapper
|
||||||
|
│ │ └── consensus.py # Voting/consensus algorithm
|
||||||
|
│ └── api/
|
||||||
|
│ ├── __init__.py
|
||||||
|
│ ├── server.py # FastAPI/uvicorn server
|
||||||
|
│ ├── routes.py # OpenAI-compatible endpoints
|
||||||
|
│ └── middleware.py # Request handling
|
||||||
|
├── tests/
|
||||||
|
├── config/
|
||||||
|
│ └── models.yaml # Model configurations
|
||||||
|
├── scripts/
|
||||||
|
│ ├── install.bat # Windows installer
|
||||||
|
│ └── install.sh # Unix installer
|
||||||
|
├── main.py # CLI entry point
|
||||||
|
├── requirements.txt
|
||||||
|
├── requirements-macos.txt # MLX-specific deps
|
||||||
|
├── setup.py
|
||||||
|
└── .gitignore
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Phases
|
||||||
|
|
||||||
|
### Phase 1: Foundation (Week 1)
|
||||||
|
|
||||||
|
#### 1.1 Hardware Detection Module
|
||||||
|
**File**: `src/hardware/detector.py`
|
||||||
|
|
||||||
|
**Requirements**:
|
||||||
|
- Cross-platform OS detection (Windows, macOS, Linux)
|
||||||
|
- CPU info (cores, architecture)
|
||||||
|
- RAM detection (total, available)
|
||||||
|
- GPU detection with VRAM
|
||||||
|
|
||||||
|
**Platform-specific implementations**:
|
||||||
|
- **Windows**: Use `pynvml` for NVIDIA; fall back to DirectX for other GPUs
|
||||||
|
- **macOS**: Use `psutil` for RAM, `sysctl` for CPU, Metal API for GPU
|
||||||
|
- **Linux**: Use `pynvml` for NVIDIA, `rocm-smi` for AMD
|
||||||
|
|
||||||
|
**Output structure**:
|
||||||
|
```python
|
||||||
|
class HardwareProfile:
|
||||||
|
os: str # 'windows', 'darwin', 'linux'
|
||||||
|
cpu_cores: int
|
||||||
|
ram_gb: float
|
||||||
|
gpu: Optional[GPUInfo]
|
||||||
|
is_apple_silicon: bool
|
||||||
|
```
|
||||||
|
|
||||||
|
**Model selection rules**:
|
||||||
|
- External GPU (NVIDIA/AMD): Use 100% of VRAM
|
||||||
|
- Apple Silicon: Use 50% of unified RAM
|
||||||
|
- CPU-only: Use 50% of system RAM
|
||||||
|
|
||||||
|
#### 1.2 Model Registry
|
||||||
|
**File**: `src/models/registry.py`
|
||||||
|
|
||||||
|
**Model database** (YAML format):
|
||||||
|
```yaml
|
||||||
|
models:
|
||||||
|
qwen2.5-coder:
|
||||||
|
name: "Qwen 2.5 Coder"
|
||||||
|
description: "Alibaba's code-focused model"
|
||||||
|
variants:
|
||||||
|
- size: 3b
|
||||||
|
base_vram_gb: 6.0 # Approximate VRAM for fp16 (~2 bytes per parameter)
|
||||||
|
quantizations:
|
||||||
|
q4_k_m:
|
||||||
|
vram_gb: 1.8
|
||||||
|
quality: "good"
|
||||||
|
q5_k_m:
|
||||||
|
vram_gb: 2.2
|
||||||
|
quality: "better"
|
||||||
|
q6_k:
|
||||||
|
vram_gb: 2.6
|
||||||
|
quality: "best"
|
||||||
|
- size: 7b
|
||||||
|
base_vram_gb: 14.0
|
||||||
|
quantizations:
|
||||||
|
q4_k_m:
|
||||||
|
vram_gb: 4.5
|
||||||
|
q5_k_m:
|
||||||
|
vram_gb: 5.2
|
||||||
|
q6_k:
|
||||||
|
vram_gb: 6.0
|
||||||
|
|
||||||
|
codellama:
|
||||||
|
name: "CodeLlama"
|
||||||
|
# Similar structure...
|
||||||
|
|
||||||
|
deepseek-coder:
|
||||||
|
name: "DeepSeek Coder"
|
||||||
|
# Similar structure...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Selection priority**:
|
||||||
|
1. Qwen 2.5 Coder (best for small sizes)
|
||||||
|
2. DeepSeek Coder (good alternative)
|
||||||
|
3. CodeLlama (fallback)
|
||||||
|
|
||||||
|
#### 1.3 Model Selector Logic
|
||||||
|
**File**: `src/models/selector.py`
|
||||||
|
|
||||||
|
**Algorithm**:
|
||||||
|
```python
|
||||||
|
def select_optimal_model(hardware: HardwareProfile) -> ModelConfig:
|
||||||
|
available_memory = get_available_memory(hardware)
|
||||||
|
|
||||||
|
# Try models in priority order
|
||||||
|
for model in PRIORITY_MODELS:
|
||||||
|
# Find largest size that fits
|
||||||
|
for variant in reversed(model.variants):
|
||||||
|
# Try highest quantization that fits
|
||||||
|
for quant in reversed(variant.quantizations):
|
||||||
|
total_vram_needed = quant.vram_gb * MIN_INSTANCES
|
||||||
|
if total_vram_needed <= available_memory:
|
||||||
|
# Calculate max instances
|
||||||
|
max_instances = int(available_memory // quant.vram_gb)
|
||||||
|
# Cap at reasonable limit (e.g., 8)
|
||||||
|
instances = min(max_instances, 8)
|
||||||
|
return ModelConfig(model, variant, quant, instances)
|
||||||
|
|
||||||
|
# Fallback to smallest model
|
||||||
|
return FALLBACK_CONFIG
|
||||||
|
```
|
||||||
|
|
||||||
|
**Minimum instances**: 2 (for consensus voting)
|
||||||
|
**Maximum instances**: 8 (to avoid overhead)
|
||||||
|
|
||||||
|
### Phase 2: Backend Integration (Week 2)
|
||||||
|
|
||||||
|
#### 2.1 Base Backend Interface
|
||||||
|
**File**: `src/backends/base.py`
|
||||||
|
|
||||||
|
```python
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import AsyncIterator
|
||||||
|
|
||||||
|
class LLMBackend(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
async def load_model(self, model_path: str, config: dict) -> bool:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def generate(self, prompt: str, **kwargs) -> str:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def generate_stream(self, prompt: str, **kwargs) -> AsyncIterator[str]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def get_memory_usage(self) -> float:
|
||||||
|
"""Return current VRAM/RAM usage in GB"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
def shutdown(self):
|
||||||
|
pass
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2.2 llama.cpp Backend
|
||||||
|
**File**: `src/backends/llamacpp.py`
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
- Use `llama-cpp-python` library
|
||||||
|
- Support GGUF model format
|
||||||
|
- GPU acceleration via CUDA/Metal
|
||||||
|
- Server mode with HTTP API
|
||||||
|
|
||||||
|
**Key features**:
|
||||||
|
- Model caching to avoid reload
|
||||||
|
- Context window management
|
||||||
|
- Batch processing support
|
||||||
|
|
||||||
|
**Memory calculation**:
|
||||||
|
```python
|
||||||
|
def calculate_memory_usage(model_path: str) -> float:
|
||||||
|
# Parse GGUF metadata
|
||||||
|
# Return estimated VRAM usage
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2.3 MLX Backend (macOS)
|
||||||
|
**File**: `src/backends/mlx.py`
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
- Use `mlx-lm` library
|
||||||
|
- Support MLX format models
|
||||||
|
- Optimized for Apple Silicon
|
||||||
|
|
||||||
|
**Key differences from llama.cpp**:
|
||||||
|
- Native Metal performance
|
||||||
|
- Simpler API
|
||||||
|
- Unified memory model
|
||||||
|
|
||||||
|
### Phase 3: Swarm Management (Week 3)
|
||||||
|
|
||||||
|
#### 3.1 Worker Instance
|
||||||
|
**File**: `src/swarm/worker.py`
|
||||||
|
|
||||||
|
Each worker manages:
|
||||||
|
- One LLM instance
|
||||||
|
- Request queue
|
||||||
|
- Health monitoring
|
||||||
|
- Metrics collection
|
||||||
|
|
||||||
|
```python
|
||||||
|
class SwarmWorker:
|
||||||
|
def __init__(self, worker_id: int, backend: LLMBackend, config: dict):
|
||||||
|
self.worker_id = worker_id
|
||||||
|
self.backend = backend
|
||||||
|
self.is_healthy = True
|
||||||
|
self.request_count = 0
|
||||||
|
self.avg_latency = 0.0
|
||||||
|
|
||||||
|
async def process(self, request: GenerationRequest) -> GenerationResponse:
|
||||||
|
start = time.time()
|
||||||
|
response = await self.backend.generate(**request.params)
|
||||||
|
latency = time.time() - start
|
||||||
|
self._update_metrics(latency)
|
||||||
|
return GenerationResponse(response, latency, self.worker_id)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3.2 Swarm Manager
|
||||||
|
**File**: `src/swarm/manager.py`
|
||||||
|
|
||||||
|
Responsibilities:
|
||||||
|
- Spawn N workers based on hardware
|
||||||
|
- Distribute requests to all workers
|
||||||
|
- Collect responses
|
||||||
|
- Handle worker failures
|
||||||
|
|
||||||
|
```python
|
||||||
|
class SwarmManager:
|
||||||
|
def __init__(self, config: ModelConfig):
|
||||||
|
self.workers: List[SwarmWorker] = []
|
||||||
|
self.config = config
|
||||||
|
|
||||||
|
async def initialize(self):
|
||||||
|
# Download model if needed
|
||||||
|
model_path = await self._ensure_model()
|
||||||
|
|
||||||
|
# Spawn workers
|
||||||
|
for i in range(self.config.instances):
|
||||||
|
backend = self._create_backend()
|
||||||
|
await backend.load_model(model_path, self.config.backend_params)
|
||||||
|
worker = SwarmWorker(i, backend, self.config)
|
||||||
|
self.workers.append(worker)
|
||||||
|
|
||||||
|
async def generate_all(self, prompt: str, **kwargs) -> List[GenerationResponse]:
|
||||||
|
# Send to all workers in parallel
|
||||||
|
tasks = [w.process(request) for w in self.workers]
|
||||||
|
return await asyncio.gather(*tasks)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3.3 Consensus Algorithm
|
||||||
|
**File**: `src/swarm/consensus.py`
|
||||||
|
|
||||||
|
**Voting strategies**:
|
||||||
|
|
||||||
|
1. **Similarity voting** (default):
|
||||||
|
- Embed all responses
|
||||||
|
- Group by semantic similarity
|
||||||
|
- Return largest group
|
||||||
|
|
||||||
|
2. **Quality scoring**:
|
||||||
|
- Score each response on:
|
||||||
|
- Completeness (does it answer the question?)
|
||||||
|
- Code quality (syntax, structure)
|
||||||
|
- Length appropriateness
|
||||||
|
- Return highest score
|
||||||
|
|
||||||
|
3. **Latency-weighted**:
|
||||||
|
- Prefer faster responses (lower memory pressure)
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
```python
|
||||||
|
class ConsensusEngine:
|
||||||
|
def __init__(self, strategy: str = "similarity"):
|
||||||
|
self.strategy = strategy
|
||||||
|
self.embedding_model = None # Lazy load
|
||||||
|
|
||||||
|
async def select_best(self, responses: List[GenerationResponse]) -> str:
|
||||||
|
if len(responses) == 1:
|
||||||
|
return responses[0].text
|
||||||
|
|
||||||
|
if self.strategy == "similarity":
|
||||||
|
return await self._similarity_vote(responses)
|
||||||
|
elif self.strategy == "quality":
|
||||||
|
return await self._quality_score(responses)
|
||||||
|
else:
|
||||||
|
return self._fastest_response(responses)
|
||||||
|
|
||||||
|
async def _similarity_vote(self, responses: List[GenerationResponse]) -> str:
|
||||||
|
# Use sentence-transformers for embeddings
|
||||||
|
# Group by cosine similarity > 0.85
|
||||||
|
# Return the most central response (highest average similarity to the others) from the largest group
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 4: API Server (Week 4)
|
||||||
|
|
||||||
|
#### 4.1 OpenAI-Compatible Endpoints
|
||||||
|
**File**: `src/api/routes.py`
|
||||||
|
|
||||||
|
Required endpoints:
|
||||||
|
- `GET /v1/models` - List available models
|
||||||
|
- `POST /v1/chat/completions` - Chat completion
|
||||||
|
- `POST /v1/completions` - Text completion (optional)
|
||||||
|
- `GET /health` - Health check
|
||||||
|
- `GET /metrics` - Prometheus metrics (optional)
|
||||||
|
|
||||||
|
**Chat completions endpoint**:
|
||||||
|
```python
|
||||||
|
@app.post("/v1/chat/completions")
|
||||||
|
async def chat_completions(request: ChatCompletionRequest):
|
||||||
|
# Extract messages
|
||||||
|
messages = request.messages
|
||||||
|
prompt = format_messages(messages)
|
||||||
|
|
||||||
|
# Get all responses from swarm
|
||||||
|
responses = await swarm_manager.generate_all(prompt, **request.params)
|
||||||
|
|
||||||
|
# Run consensus
|
||||||
|
best_response = await consensus_engine.select_best(responses)
|
||||||
|
|
||||||
|
# Format as OpenAI response
|
||||||
|
return {
|
||||||
|
"id": f"chatcmpl-{uuid4()}",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"created": int(time.time()),
|
||||||
|
"model": request.model,
|
||||||
|
"choices": [{
|
||||||
|
"index": 0,
|
||||||
|
"message": {"role": "assistant", "content": best_response},
|
||||||
|
"finish_reason": "stop"
|
||||||
|
}],
|
||||||
|
"usage": calculate_usage(prompt, best_response)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 4.2 Streaming Support
|
||||||
|
**File**: `src/api/routes.py`
|
||||||
|
|
||||||
|
For streaming, use the fastest worker instead of consensus:
|
||||||
|
```python
|
||||||
|
if request.stream:
|
||||||
|
# Pick worker with lowest latency
|
||||||
|
worker = swarm_manager.get_fastest_worker()
|
||||||
|
return StreamingResponse(
|
||||||
|
worker.stream_generate(prompt),
|
||||||
|
media_type="text/event-stream"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 5: CLI & Distribution (Week 5)
|
||||||
|
|
||||||
|
#### 5.1 CLI Interface
|
||||||
|
**File**: `main.py`
|
||||||
|
|
||||||
|
Commands:
|
||||||
|
```bash
|
||||||
|
# Start the swarm (auto-detect hardware)
|
||||||
|
python -m local_swarm
|
||||||
|
|
||||||
|
# Start with specific model
|
||||||
|
python -m local_swarm --model qwen2.5-coder:3b:q4
|
||||||
|
|
||||||
|
# Start with specific port
|
||||||
|
python -m local_swarm --port 8080
|
||||||
|
|
||||||
|
# Override instance count
|
||||||
|
python -m local_swarm --instances 4
|
||||||
|
|
||||||
|
# Show hardware detection
|
||||||
|
python -m local_swarm --detect
|
||||||
|
|
||||||
|
# Download models only
|
||||||
|
python -m local_swarm --download-only
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 5.2 Configuration File
|
||||||
|
**File**: `config.yaml`
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
server:
|
||||||
|
host: "127.0.0.1"
|
||||||
|
port: 8000
|
||||||
|
|
||||||
|
swarm:
|
||||||
|
consensus_strategy: "similarity" # similarity, quality, fastest
|
||||||
|
min_instances: 2
|
||||||
|
max_instances: 8
|
||||||
|
timeout: 60
|
||||||
|
|
||||||
|
models:
|
||||||
|
cache_dir: "~/.local_swarm/models"
|
||||||
|
preferred_models:
|
||||||
|
- qwen2.5-coder
|
||||||
|
- deepseek-coder
|
||||||
|
- codellama
|
||||||
|
|
||||||
|
hardware:
|
||||||
|
gpu_memory_fraction: 1.0 # Use 100% of GPU VRAM
|
||||||
|
ram_fraction: 0.5 # Use 50% of system RAM for CPU/Apple Silicon
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 5.3 Installation Scripts
|
||||||
|
|
||||||
|
**Windows** (`scripts/install.bat`):
|
||||||
|
```batch
|
||||||
|
@echo off
|
||||||
|
echo Installing Local Swarm...
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
:: Check for CUDA
|
||||||
|
nvidia-smi >nul 2>&1
|
||||||
|
if %errorlevel% == 0 (
|
||||||
|
echo CUDA detected, installing GPU support...
|
||||||
|
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
||||||
|
) else (
|
||||||
|
echo No CUDA detected, using CPU backend...
|
||||||
|
pip install llama-cpp-python
|
||||||
|
)
|
||||||
|
|
||||||
|
echo Installation complete!
|
||||||
|
echo Run: python -m local_swarm
|
||||||
|
```
|
||||||
|
|
||||||
|
**macOS/Linux** (`scripts/install.sh`):
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
echo "Installing Local Swarm..."
|
||||||
|
pip install --upgrade pip
|
||||||
|
|
||||||
|
# Detect platform
|
||||||
|
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||||
|
echo "macOS detected..."
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install -r requirements-macos.txt
|
||||||
|
elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
|
||||||
|
echo "Linux detected..."
|
||||||
|
pip install -r requirements.txt
|
||||||
|
if command -v nvidia-smi &> /dev/null; then
|
||||||
|
echo "CUDA detected, installing GPU support..."
|
||||||
|
pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
||||||
|
else
|
||||||
|
pip install llama-cpp-python
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Installation complete!"
|
||||||
|
echo "Run: python -m local_swarm"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 6: Testing & Polish (Week 6)
|
||||||
|
|
||||||
|
#### 6.1 Test Coverage
|
||||||
|
|
||||||
|
**Unit tests**:
|
||||||
|
- Hardware detection mocking
|
||||||
|
- Model selection logic
|
||||||
|
- Consensus algorithm
|
||||||
|
- API endpoint validation
|
||||||
|
|
||||||
|
**Integration tests**:
|
||||||
|
- End-to-end inference
|
||||||
|
- Multi-worker coordination
|
||||||
|
- Error handling
|
||||||
|
|
||||||
|
**Platform tests**:
|
||||||
|
- Windows with NVIDIA
|
||||||
|
- macOS with M1/M2/M3
|
||||||
|
- Linux with CUDA
|
||||||
|
- CPU-only fallback
|
||||||
|
|
||||||
|
#### 6.2 Performance Optimization
|
||||||
|
|
||||||
|
- **Model warmup**: Pre-load models on startup
|
||||||
|
- **Request batching**: Group similar requests
|
||||||
|
- **Worker pooling**: Reuse workers instead of respawning
|
||||||
|
- **Memory monitoring**: Auto-shutdown if OOM
|
||||||
|
|
||||||
|
#### 6.3 Documentation
|
||||||
|
|
||||||
|
- API documentation (OpenAPI spec)
|
||||||
|
- Configuration guide
|
||||||
|
- Troubleshooting
|
||||||
|
- Performance tuning tips
|
||||||
|
|
||||||
|
## Technical Decisions
|
||||||
|
|
||||||
|
### Why llama.cpp?
|
||||||
|
- Best cross-platform support
|
||||||
|
- Mature quantization formats (GGUF)
|
||||||
|
- Active community
|
||||||
|
- Good performance/quality tradeoff
|
||||||
|
|
||||||
|
### Why MLX for macOS?
|
||||||
|
- Native Apple Silicon optimization
|
||||||
|
- Simpler than llama.cpp on macOS
|
||||||
|
- Better unified memory handling
|
||||||
|
|
||||||
|
### Why consensus voting?
|
||||||
|
- Improves response quality vs single model
|
||||||
|
- Uses available hardware efficiently
|
||||||
|
- Can detect model hallucinations
|
||||||
|
|
||||||
|
### Memory Model
|
||||||
|
|
||||||
|
**External GPU (NVIDIA/AMD)**:
|
||||||
|
- Target up to 100% of VRAM (`gpu_memory_fraction: 1.0`), less the OS/driver buffer noted below
|
||||||
|
- Keep 10% buffer for OS/drivers
|
||||||
|
- Each instance gets equal share
|
||||||
|
|
||||||
|
**Apple Silicon**:
|
||||||
|
- Use 50% of unified RAM
|
||||||
|
- Avoid system swap
|
||||||
|
- Monitor memory pressure
|
||||||
|
|
||||||
|
**CPU-only**:
|
||||||
|
- Use 50% of system RAM
|
||||||
|
- Dependent on available memory
|
||||||
|
- Slower but functional
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
1. **Multi-GPU support**: Distribute across multiple GPUs
|
||||||
|
2. **Dynamic scaling**: Add/remove workers based on load
|
||||||
|
3. **Model mixing**: Different models in same swarm
|
||||||
|
4. **Fine-tuning**: Local fine-tuning on user data
|
||||||
|
5. **Web UI**: Browser-based configuration
|
||||||
|
6. **Docker support**: Containerized deployment
|
||||||
|
7. **Cloud inference**: Fallback to cloud APIs
|
||||||
|
|
||||||
|
## Success Metrics
|
||||||
|
|
||||||
|
- **Startup time**: < 30 seconds from cold start
|
||||||
|
- **First inference**: < 10 seconds after startup
|
||||||
|
- **Concurrent requests**: Support 2-8 parallel inferences
|
||||||
|
- **Consensus accuracy**: > 80% agreement on code tasks
|
||||||
|
- **Memory efficiency**: Use > 80% of available memory
|
||||||
|
- **Cross-platform**: Works on Windows/macOS/Linux without code changes
|
||||||
@@ -0,0 +1,352 @@
|
|||||||
|
# Local Swarm
|
||||||
|
|
||||||
|
Automatically configure and run a swarm of small coding LLMs optimized for your hardware. Provides an OpenAI-compatible API for seamless integration with opencode and other tools.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Hardware Auto-Detection**: Automatically detects your GPU (NVIDIA, or AMD via ROCm on Linux), Apple Silicon, or CPU and selects optimal settings
|
||||||
|
- **Smart Model Selection**: Chooses the best model, quantization, and instance count based on available VRAM/RAM
|
||||||
|
- **Swarm Consensus**: Multiple LLM instances vote on the best response for higher quality outputs
|
||||||
|
- **OpenAI-Compatible API**: Drop-in replacement for OpenAI API at `http://localhost:8000/v1`
|
||||||
|
- **Cross-Platform**: Works on Windows, macOS, and Linux with automatic backend selection
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
#### Windows (PowerShell)
|
||||||
|
```powershell
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://github.com/yourusername/local_swarm.git
|
||||||
|
cd local_swarm
|
||||||
|
|
||||||
|
# Run installer
|
||||||
|
.\scripts\install.bat
|
||||||
|
```
|
||||||
|
|
||||||
|
#### macOS/Linux
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://github.com/yourusername/local_swarm.git
|
||||||
|
cd local_swarm
|
||||||
|
|
||||||
|
# Run installer
|
||||||
|
chmod +x scripts/install.sh
|
||||||
|
./scripts/install.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
#### Start the Swarm
|
||||||
|
```bash
|
||||||
|
# Auto-detect hardware and start
|
||||||
|
python -m local_swarm
|
||||||
|
|
||||||
|
# Or run the entry script directly from the repository root
|
||||||
|
python main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
On first run, the tool will:
|
||||||
|
1. Scan your hardware (GPU, RAM, CPU)
|
||||||
|
2. Select the optimal model and quantization
|
||||||
|
3. Download the model (one-time)
|
||||||
|
4. Start multiple instances based on available memory
|
||||||
|
5. Expose the API at `http://localhost:8000`
|
||||||
|
|
||||||
|
Example startup output:
|
||||||
|
```
|
||||||
|
🔍 Detecting hardware...
|
||||||
|
OS: Windows 11
|
||||||
|
GPU: NVIDIA GeForce RTX 4060 Ti (16 GB VRAM)
|
||||||
|
CPU: 16 cores
|
||||||
|
RAM: 32 GB
|
||||||
|
|
||||||
|
📊 Optimal configuration:
|
||||||
|
Model: Qwen 2.5 Coder 3B
|
||||||
|
Quantization: Q4_K_M (1.8 GB per instance)
|
||||||
|
Instances: 8 (using 14.4 GB VRAM)
|
||||||
|
|
||||||
|
⬇️ Downloading model...
|
||||||
|
Progress: 100% ▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓ 1.8/1.8 GB
|
||||||
|
|
||||||
|
🚀 Starting swarm...
|
||||||
|
Worker 1: Ready (GPU:0)
|
||||||
|
Worker 2: Ready (GPU:0)
|
||||||
|
...
|
||||||
|
Worker 8: Ready (GPU:0)
|
||||||
|
|
||||||
|
✅ Local Swarm is running!
|
||||||
|
API: http://localhost:8000/v1
|
||||||
|
Models: http://localhost:8000/v1/models
|
||||||
|
Health: http://localhost:8000/health
|
||||||
|
|
||||||
|
💡 Configure opencode to use:
|
||||||
|
base_url: http://localhost:8000/v1
|
||||||
|
api_key: any (not used)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Configure opencode
|
||||||
|
|
||||||
|
Add to your opencode configuration:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": {
|
||||||
|
"provider": "openai",
|
||||||
|
"base_url": "http://localhost:8000/v1",
|
||||||
|
"api_key": "not-needed",
|
||||||
|
"model": "local-swarm"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Create a `config.yaml` file for customization:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
server:
|
||||||
|
host: "127.0.0.1"
|
||||||
|
port: 8000
|
||||||
|
|
||||||
|
swarm:
|
||||||
|
consensus_strategy: "similarity" # similarity, quality, fastest
|
||||||
|
min_instances: 2
|
||||||
|
max_instances: 8
|
||||||
|
|
||||||
|
hardware:
|
||||||
|
gpu_memory_fraction: 1.0 # Use 100% of GPU VRAM
|
||||||
|
ram_fraction: 0.5 # Use 50% of system RAM for CPU/Apple Silicon
|
||||||
|
|
||||||
|
models:
|
||||||
|
cache_dir: "~/.local_swarm/models"
|
||||||
|
```
|
||||||
|
|
||||||
|
## CLI Options
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Show hardware detection without starting
|
||||||
|
python -m local_swarm --detect
|
||||||
|
|
||||||
|
# Use specific model
|
||||||
|
python -m local_swarm --model qwen2.5-coder:3b:q4
|
||||||
|
|
||||||
|
# Use specific port
|
||||||
|
python -m local_swarm --port 8080
|
||||||
|
|
||||||
|
# Force number of instances
|
||||||
|
python -m local_swarm --instances 4
|
||||||
|
|
||||||
|
# Download models only (no server)
|
||||||
|
python -m local_swarm --download-only
|
||||||
|
|
||||||
|
# Show help
|
||||||
|
python -m local_swarm --help
|
||||||
|
```
|
||||||
|
|
||||||
|
## How It Works
|
||||||
|
|
||||||
|
### Hardware Detection
|
||||||
|
|
||||||
|
The tool automatically detects your system:
|
||||||
|
- **Windows**: NVIDIA GPUs via NVML, DirectX fallback
|
||||||
|
- **macOS**: Apple Silicon via Metal, unified memory model
|
||||||
|
- **Linux**: NVIDIA (NVML), AMD (ROCm)
|
||||||
|
|
||||||
|
### Model Selection
|
||||||
|
|
||||||
|
Based on available memory:
|
||||||
|
1. **External GPU**: Use 100% of VRAM minus OS overhead
|
||||||
|
2. **Apple Silicon**: Use 50% of unified RAM
|
||||||
|
3. **CPU-only**: Use 50% of system RAM
|
||||||
|
|
||||||
|
The algorithm selects:
|
||||||
|
- Largest model size that fits
|
||||||
|
- Highest quantization quality possible
|
||||||
|
- Maximum instances (2-8) based on memory
|
||||||
|
|
||||||
|
Example configurations:
|
||||||
|
|
||||||
|
| Hardware | Model | Quant | Instances | Memory Used |
|
||||||
|
|----------|-------|-------|-----------|-------------|
|
||||||
|
| RTX 4060 Ti 16GB | Qwen 2.5 7B | Q4_K_M | 3 | ~13.5 GB |
|
||||||
|
| RTX 4060 Ti 8GB | Qwen 2.5 3B | Q6_K | 3 | ~7.8 GB |
|
||||||
|
| M3 Pro 36GB | Qwen 2.5 7B | Q4_K_M | 4 | ~18 GB |
|
||||||
|
| M1 8GB | Qwen 2.5 3B | Q4_K_M | 2 | ~3.6 GB |
|
||||||
|
| CPU 32GB | Qwen 2.5 3B | Q4_K_M | 8 | ~14.4 GB |
|
||||||
|
|
||||||
|
### Swarm Consensus
|
||||||
|
|
||||||
|
For each request, the swarm:
|
||||||
|
1. Sends the prompt to all running instances
|
||||||
|
2. Collects responses in parallel
|
||||||
|
3. Runs consensus algorithm:
|
||||||
|
- **Similarity**: Groups responses by semantic similarity, returns largest group
|
||||||
|
- **Quality**: Scores responses on completeness and code quality
|
||||||
|
- **Fastest**: Returns the quickest response
|
||||||
|
4. Returns the winning response via OpenAI-compatible API
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### GET /v1/models
|
||||||
|
List available models
|
||||||
|
|
||||||
|
### POST /v1/chat/completions
|
||||||
|
Chat completion with consensus
|
||||||
|
|
||||||
|
**Request**:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"model": "local-swarm",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "Write a Python function to sort a list"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response**:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "chatcmpl-abc123",
|
||||||
|
"object": "chat.completion",
|
||||||
|
"created": 1234567890,
|
||||||
|
"model": "local-swarm",
|
||||||
|
"choices": [{
|
||||||
|
"index": 0,
|
||||||
|
"message": {
|
||||||
|
"role": "assistant",
|
||||||
|
"content": "def sort_list(lst):\n return sorted(lst)"
|
||||||
|
},
|
||||||
|
"finish_reason": "stop"
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### GET /health
|
||||||
|
Health check
|
||||||
|
|
||||||
|
### GET /metrics
|
||||||
|
Prometheus metrics (optional)
|
||||||
|
|
||||||
|
## Supported Models
|
||||||
|
|
||||||
|
Currently supported models (auto-selected based on hardware):
|
||||||
|
|
||||||
|
- **Qwen 2.5 Coder** (3B, 7B, 14B) - Recommended for coding tasks
|
||||||
|
- **DeepSeek Coder** (1.3B, 6.7B, 33B) - Good alternative
|
||||||
|
- **CodeLlama** (7B, 13B, 34B) - Meta's code model
|
||||||
|
|
||||||
|
All models support GGUF quantization:
|
||||||
|
- Q4_K_M - Good quality, smallest size (recommended)
|
||||||
|
- Q5_K_M - Better quality
|
||||||
|
- Q6_K - Best quality
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Out of Memory
|
||||||
|
If you get OOM errors:
|
||||||
|
```bash
|
||||||
|
# Reduce instances
|
||||||
|
python -m local_swarm --instances 2
|
||||||
|
|
||||||
|
# Or use smaller model
|
||||||
|
python -m local_swarm --model qwen2.5-coder:3b:q4
|
||||||
|
```
|
||||||
|
|
||||||
|
### Slow Performance
|
||||||
|
- Check GPU utilization with `nvidia-smi` (NVIDIA) or Activity Monitor (macOS)
|
||||||
|
- Ensure model is cached (first run downloads to `~/.local_swarm/models`)
|
||||||
|
- Try reducing instances to avoid contention
|
||||||
|
|
||||||
|
### Windows: CUDA not detected
|
||||||
|
Make sure NVIDIA drivers are installed:
|
||||||
|
```powershell
|
||||||
|
nvidia-smi
|
||||||
|
```
|
||||||
|
If this fails, reinstall drivers from nvidia.com
|
||||||
|
|
||||||
|
### macOS: MLX not found
|
||||||
|
```bash
|
||||||
|
pip install mlx-lm
|
||||||
|
```
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Python 3.9+
|
||||||
|
- 4GB+ RAM (8GB+ recommended)
|
||||||
|
- Optional: NVIDIA GPU with 4GB+ VRAM
|
||||||
|
- Optional: Apple Silicon Mac
|
||||||
|
|
||||||
|
## Development
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install dev dependencies
|
||||||
|
pip install -r requirements-dev.txt
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
pytest
|
||||||
|
|
||||||
|
# Run specific platform tests
|
||||||
|
pytest tests/test_hardware.py -v
|
||||||
|
|
||||||
|
# Format code
|
||||||
|
black src/
|
||||||
|
ruff check src/
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────┐
|
||||||
|
│ OpenAI API Client │
|
||||||
|
│ (opencode, etc.) │
|
||||||
|
└─────────────┬───────────────────────┘
|
||||||
|
│ HTTP
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────┐
|
||||||
|
│ Local Swarm API Server │
|
||||||
|
│ (FastAPI / localhost:8000) │
|
||||||
|
└─────────────┬───────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────┐
|
||||||
|
│ Swarm Manager │
|
||||||
|
│ ┌─────────┐ ┌─────────┐ │
|
||||||
|
│ │ Worker 1│ │ Worker 2│ ... │
|
||||||
|
│ │(LLM #1) │ │(LLM #2) │ │
|
||||||
|
│ └────┬────┘ └────┬────┘ │
|
||||||
|
│ │ │ │
|
||||||
|
│ └─────┬─────┘ │
|
||||||
|
│ ▼ │
|
||||||
|
│ Consensus Engine │
|
||||||
|
└─────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────┐
|
||||||
|
│ Backend (llama.cpp / MLX) │
|
||||||
|
│ ┌─────────────────────┐ │
|
||||||
|
│ │ GGUF/MLX Model │ │
|
||||||
|
│ │ (Qwen/Codellama) │ │
|
||||||
|
│ └─────────────────────┘ │
|
||||||
|
└─────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────┐
|
||||||
|
│ Hardware (GPU/CPU/Apple Silicon) │
|
||||||
|
└─────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
MIT License - See LICENSE file
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Contributions welcome! Please read CONTRIBUTING.md first.
|
||||||
|
|
||||||
|
## Acknowledgments
|
||||||
|
|
||||||
|
- [llama.cpp](https://github.com/ggerganov/llama.cpp) - Inference engine
|
||||||
|
- [MLX](https://github.com/ml-explore/mlx) - Apple Silicon backend
|
||||||
|
- [Qwen](https://github.com/QwenLM/Qwen) - Model family
|
||||||
|
- [HuggingFace](https://huggingface.co) - Model hosting
|
||||||
@@ -0,0 +1,96 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Local Swarm - Automatically configure and run a swarm of small coding LLMs
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add src to path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent / "src"))
|
||||||
|
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.panel import Panel
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
|
||||||
|
|
||||||
|
def _int_in_range(label, minimum, maximum=None):
    """Build an argparse ``type=`` callable that only accepts bounded integers.

    Args:
        label: Human-readable name of the value, used in error messages.
        minimum: Smallest accepted value (inclusive).
        maximum: Largest accepted value (inclusive), or ``None`` for no
            upper bound.

    Returns:
        A function that converts a command-line string to ``int`` and raises
        ``argparse.ArgumentTypeError`` when the text is not an integer or the
        value falls outside the allowed range.
    """

    def parse(text):
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(
                f"invalid {label}: {text!r} is not an integer"
            ) from None
        if value < minimum or (maximum is not None and value > maximum):
            upper = "" if maximum is None else f" and <= {maximum}"
            raise argparse.ArgumentTypeError(
                f"invalid {label}: {value} (must be >= {minimum}{upper})"
            )
        return value

    return parse


def main(argv=None):
    """Parse the command line and run the (placeholder) Local Swarm CLI.

    Prints a welcome banner, then either reports that hardware detection is
    not implemented yet (``--detect``) or prints the placeholder start-up
    message with the next implementation steps.

    ``--model``, ``--port``, ``--instances``, ``--download-only`` and
    ``--config`` are parsed and validated but not acted upon yet.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers and the
            console-script entry point keep working unchanged.
    """
    parser = argparse.ArgumentParser(
        description="Local Swarm - AI-powered coding LLM swarm",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python main.py                      # Auto-detect and start
  python main.py --detect             # Show hardware detection only
  python main.py --model qwen:3b:q4   # Use specific model
  python main.py --port 8080          # Use custom port
  python main.py --instances 4        # Force 4 instances
        """,
    )

    parser.add_argument(
        "--detect",
        action="store_true",
        help="Show hardware detection and exit",
    )
    parser.add_argument(
        "--model",
        type=str,
        help="Model to use (format: name:size:quant, e.g., qwen:3b:q4)",
    )
    parser.add_argument(
        "--port",
        # Reject values that can never be bound as a TCP port.
        type=_int_in_range("port", 1, 65535),
        default=8000,
        help="Port to run the API server on (default: 8000)",
    )
    parser.add_argument(
        "--instances",
        # Zero or negative worker counts make no sense for a swarm.
        type=_int_in_range("instance count", 1),
        help="Force number of instances (overrides auto-calculation)",
    )
    parser.add_argument(
        "--download-only",
        action="store_true",
        help="Download models only, don't start server",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="config.yaml",
        help="Path to config file",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s 0.1.0",
    )

    args = parser.parse_args(argv)

    # Show welcome
    console.print(Panel.fit(
        "[bold blue]Local Swarm[/bold blue] - AI-powered coding LLM swarm\n"
        "Automatically configures optimal LLM setup for your hardware",
        title="Welcome",
        border_style="blue",
    ))

    if args.detect:
        console.print("[yellow]Hardware detection mode - not yet implemented[/yellow]")
        console.print("Run without --detect to start the swarm (once implemented)")
        return

    console.print("[green]Starting Local Swarm...[/green]")
    console.print("[dim]Note: This is a placeholder. Implementation in progress.[/dim]")
    console.print()
    console.print("[bold]Next steps:[/bold]")
    console.print("1. Check PLAN.md for implementation details")
    console.print("2. Start implementing src/hardware/detector.py")
    console.print("3. Continue with other modules")


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
# macOS specific dependencies
|
||||||
|
mlx>=0.15.0
|
||||||
|
mlx-lm>=0.8.0
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
# Core dependencies
pydantic>=2.0.0
pyyaml>=6.0
requests>=2.31.0
tqdm>=4.65.0
psutil>=5.9.0

# API server
fastapi>=0.104.0
uvicorn[standard]>=0.24.0

# Hardware detection
pynvml>=11.5.0

# ML/Embeddings (for consensus)
sentence-transformers>=2.2.0
numpy>=1.24.0

# llama.cpp (CPU version, GPU version installed via scripts)
llama-cpp-python>=0.2.0

# Async
aiohttp>=3.9.0
# NOTE: asyncio is part of the standard library since Python 3.4. The PyPI
# "asyncio" distribution is an obsolete backport that shadows the built-in
# module and breaks on modern interpreters, so it must not be listed here.

# CLI
click>=8.1.0
rich>=13.0.0
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
@echo off
REM Local Swarm installer for Windows.
REM Verifies that Python is available, installs the base requirements and,
REM when an NVIDIA driver is present, swaps in the CUDA build of
REM llama-cpp-python. Exits with code 1 if Python is missing or a required
REM install step fails.

echo ==========================================
echo Local Swarm - Windows Installer
echo ==========================================
echo.

REM Check Python
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python is not installed or not in PATH
    echo Please install Python 3.9+ from https://python.org
    exit /b 1
)

echo [1/4] Checking Python version...
for /f "tokens=2" %%a in ('python --version') do set PYTHON_VERSION=%%a
echo Found Python %PYTHON_VERSION%

echo.
echo [2/4] Upgrading pip...
python -m pip install --upgrade pip

echo.
echo [3/4] Installing base dependencies...
REM Use "python -m pip" so packages land in the interpreter checked above,
REM not in whichever pip.exe happens to be first on PATH.
python -m pip install -r requirements.txt
if errorlevel 1 (
    echo [ERROR] Failed to install base dependencies
    exit /b 1
)

REM Check for CUDA
nvidia-smi >nul 2>&1
if %errorlevel% == 0 (
    echo.
    echo [4/4] CUDA detected! Installing GPU-accelerated llama.cpp...
    python -m pip uninstall -y llama-cpp-python
    python -m pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
    REM "if errorlevel" is evaluated at run time, unlike %%errorlevel%%, which
    REM is expanded once when this parenthesised block is parsed.
    if errorlevel 1 (
        echo [ERROR] Failed to install GPU-accelerated llama.cpp
        exit /b 1
    )
    echo GPU support enabled!
) else (
    echo.
    echo [4/4] No CUDA detected, using CPU backend...
    REM Parentheses must be escaped here: a bare ")" would close the else block.
    echo CPU-only mode ^(slower but works on any hardware^)
)

echo.
echo ==========================================
echo Installation Complete!
echo ==========================================
echo.
echo To start Local Swarm:
echo python main.py
echo.
echo To check hardware detection:
echo python main.py --detect
echo.
echo For more options:
echo python main.py --help
echo.
pause
|
||||||
Executable
+98
@@ -0,0 +1,98 @@
|
|||||||
|
#!/bin/bash
# Local Swarm installer for macOS and Linux.
# Checks for python3, installs the base requirements, then installs the
# inference backend that matches the detected platform:
#   - macOS on Apple Silicon -> MLX (requirements-macos.txt)
#   - macOS on Intel         -> llama.cpp (CPU)
#   - Linux with nvidia-smi  -> CUDA build of llama.cpp
#   - anything else          -> llama.cpp (CPU)
# Any failing command aborts the script (set -e).
set -e

echo "=========================================="
echo " Local Swarm - Installer"
echo "=========================================="
echo

# Colors
# ANSI-C quoting ($'...') stores the real escape character, so the colours
# render with plain `echo` as well as with `echo -e`.
RED=$'\033[0;31m'
GREEN=$'\033[0;32m'
YELLOW=$'\033[1;33m'
NC=$'\033[0m' # No Color

# Check Python
if ! command -v python3 &> /dev/null; then
    echo -e "${RED}[ERROR] Python 3 is not installed${NC}"
    echo "Please install Python 3.9+ and try again"
    exit 1
fi

echo "[1/4] Checking Python version..."
PYTHON_VERSION=$(python3 --version | cut -d' ' -f2)
echo " Found Python $PYTHON_VERSION"

echo
echo "[2/4] Upgrading pip..."
python3 -m pip install --upgrade pip

echo
echo "[3/4] Installing base dependencies..."
# Always go through "python3 -m pip" so packages are installed into the
# interpreter checked above; a bare pip3 may belong to a different Python.
python3 -m pip install -r requirements.txt

# Detect platform and install appropriate backend
echo
echo "[4/4] Detecting hardware and installing backend..."

if [[ "$OSTYPE" == "darwin"* ]]; then
    # macOS
    echo " Platform: macOS"

    # Check for Apple Silicon
    if [[ $(uname -m) == "arm64" ]]; then
        echo " Hardware: Apple Silicon detected!"
        echo " Installing MLX backend..."
        python3 -m pip install -r requirements-macos.txt
        echo " ${GREEN}MLX backend installed!${NC}"
    else
        echo " Hardware: Intel Mac"
        echo " Installing llama.cpp (CPU)..."
        python3 -m pip install llama-cpp-python
        echo " ${GREEN}llama.cpp installed (CPU mode)${NC}"
    fi

elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
    # Linux
    echo " Platform: Linux"

    # Check for NVIDIA GPU
    if command -v nvidia-smi &> /dev/null; then
        echo " Hardware: NVIDIA GPU detected!"
        echo " Installing CUDA-enabled llama.cpp..."
        # Remove the CPU wheel pulled in by requirements.txt first; ignore the
        # error if it was never installed.
        python3 -m pip uninstall -y llama-cpp-python 2>/dev/null || true
        python3 -m pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
        echo " ${GREEN}GPU support enabled!${NC}"
    else
        echo " Hardware: No NVIDIA GPU detected"
        echo " Installing llama.cpp (CPU)..."
        python3 -m pip install llama-cpp-python
        echo " ${GREEN}CPU backend installed${NC}"
    fi

    # Check for AMD GPU (ROCm)
    if command -v rocm-smi &> /dev/null; then
        echo -e "${YELLOW}[WARNING] AMD GPU detected but ROCm support is experimental${NC}"
        echo " Using CPU backend for now"
    fi

else
    echo -e "${YELLOW}[WARNING] Unknown platform: $OSTYPE${NC}"
    echo " Installing generic CPU backend..."
    python3 -m pip install llama-cpp-python
fi

echo
echo "=========================================="
echo " Installation Complete!"
echo "=========================================="
echo
echo "To start Local Swarm:"
echo " python3 main.py"
echo
echo "To check hardware detection:"
echo " python3 main.py --detect"
echo
echo "For more options:"
echo " python3 main.py --help"
echo
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
"""Packaging script for Local Swarm."""
from pathlib import Path

from setuptools import setup, find_packages

# Resolve metadata files relative to this script so that builds started from
# another working directory still find them.
HERE = Path(__file__).resolve().parent


def _read_requirements(path):
    """Return the requirement specifiers listed in a pip requirements file.

    Blank lines, comment lines (also when indented), trailing `` #`` comments
    and pip option lines such as ``-r other.txt`` or ``--extra-index-url ...``
    are skipped, because ``install_requires`` only accepts plain requirement
    specifiers.

    Args:
        path: ``pathlib.Path`` of the requirements file.

    Returns:
        List of requirement strings in file order.
    """
    specifiers = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        # pip treats whitespace followed by "#" as the start of a comment.
        line = raw_line.split(" #", 1)[0].strip()
        if not line or line.startswith(("#", "-")):
            continue
        specifiers.append(line)
    return specifiers


long_description = (HERE / "README.md").read_text(encoding="utf-8")
requirements = _read_requirements(HERE / "requirements.txt")

setup(
    name="local-swarm",
    version="0.1.0",
    author="Local Swarm Contributors",
    description="Automatically configure and run a swarm of small coding LLMs",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/yourusername/local_swarm",
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.9",
    install_requires=requirements,
    extras_require={
        "macos": ["mlx>=0.15.0", "mlx-lm>=0.8.0"],
        "dev": [
            "pytest>=7.4.0",
            "pytest-asyncio>=0.21.0",
            "black>=23.0.0",
            "ruff>=0.1.0",
            "mypy>=1.6.0",
        ],
    },
    entry_points={
        "console_scripts": [
            # NOTE(review): this targets a top-level "main" module, but only
            # packages found under src/ are declared above. If main.py lives
            # next to this file it will not be installed and the script will
            # fail to import -- confirm the layout and move or declare it.
            "local-swarm=main:main",
        ],
    },
)
|
||||||
Reference in New Issue
Block a user