local_swarm/src/models/selector.py
sleepy 2461f45ca8 fix: Remove slow HF API check from recommended config selection
- select_optimal_model was checking HF API for available quantizations
- This caused menu to hang/slow down when changing context
- Now only checks availability when browsing or custom config
- Recommended config uses default quantizations (faster)
2026-02-23 23:54:57 +01:00

"""Model selection logic for Local Swarm."""
from dataclasses import dataclass
from typing import Optional, List
from hardware.detector import HardwareProfile
from models.registry import Model, ModelVariant, QuantizationConfig, list_models
@dataclass
class ModelConfig:
"""Configuration for running a model in the swarm."""
model: Model
variant: ModelVariant
quantization: QuantizationConfig
instances: int
memory_per_instance_gb: float
total_memory_gb: float
context_size: int = 32768 # Context window in tokens (16K, 32K, 64K, 128K)
offload_percent: float = 0.0 # Percentage of layers offloaded to RAM (0.0, 0.2, 0.5)
vram_usage_gb: float = 0.0 # Actual VRAM usage per instance
ram_usage_gb: float = 0.0 # System RAM usage per instance (when offloading)
    def __post_init__(self):
        """Coerce explicitly passed None values back to their defaults."""
        # Dataclass fields are always set by __init__, so only None needs handling
        if self.context_size is None:
            self.context_size = 32768
        if self.offload_percent is None:
            self.offload_percent = 0.0
def __repr__(self) -> str:
return (f"ModelConfig({self.model.name} {self.variant.size} "
f"{self.quantization.name}, {self.instances} instances, "
f"{self.total_memory_gb:.1f}GB total)")
@property
def model_id(self) -> str:
return f"{self.model.id}:{self.variant.size}:{self.quantization.name}"
@property
def display_name(self) -> str:
        offload_str = f", +{int(self.offload_percent * 100)}% offload" if self.offload_percent > 0 else ""
        return f"{self.model.name} {self.variant.size} ({self.quantization.name}, {self.context_size // 1000}K ctx{offload_str})"
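    # Illustrative property output (registry values below are hypothetical):
    #   model_id     -> "qwen2.5-coder:7B:q4_k_m"
    #   display_name -> "Qwen2.5 Coder 7B (q4_k_m, 32K ctx, +20% offload)"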
# Configuration constraints
MIN_INSTANCES = 1 # Allow 1 instance (needed for Apple Silicon MLX)
MAX_INSTANCES = 8
OPTIMAL_MAX_INSTANCES = 5 # Sweet spot for consensus (85-90% benefit)
MEMORY_OVERHEAD_FACTOR = 0.95 # Leave 5% buffer
# Apple Silicon MLX constraints - MLX uses GPU efficiently with 1 worker
MLX_MAX_INSTANCES = 1 # MLX handles all GPU resources in single instance
# Context window options
CONTEXT_OPTIONS = {
16384: "16K tokens",
32768: "32K tokens (default)",
65536: "64K tokens",
131072: "128K tokens"
}
# Offloading options
OFFLOAD_OPTIONS = {
0.0: "No offload (default) - 100% GPU",
0.2: "20% offload - 80% GPU, 20% RAM",
0.5: "50% offload - 50% GPU, 50% RAM"
}
def calculate_context_memory(context_size: int, quantization_bits: int = 4) -> float:
    """
    Estimate the additional memory needed for the KV cache at a given context size.

    Rough heuristic: KV-cache memory scales linearly with both context length
    and quantization bits, at ~0.25 MB per token for 4-bit.

    Args:
        context_size: Number of tokens in context window
        quantization_bits: Quantization bits (4 for Q4, 5 for Q5, etc.)

    Returns:
        Additional VRAM needed in GB
    """
    # MB per token, scaled by quantization precision: (bits / 8) * 0.5
    mb_per_token = (quantization_bits / 8) * 0.5
    memory_mb = context_size * mb_per_token
    return memory_mb / 1024  # Convert to GB
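# Worked example of the KV-cache heuristic above:
#   calculate_context_memory(32768, 4)
#   = 32768 tokens * (4 / 8 * 0.5) MB/token = 8192 MB ≈ 8.0 GB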
def calculate_memory_with_offload(
base_vram_gb: float,
context_size: int,
offload_percent: float,
quantization_bits: int = 4
) -> tuple[float, float]:
"""
Calculate VRAM and RAM usage with offloading.
Args:
base_vram_gb: Base model VRAM without context
context_size: Context window size in tokens
offload_percent: Percentage of model offloaded to RAM (0.0-1.0)
quantization_bits: Quantization precision
Returns:
(vram_usage_gb, ram_usage_gb)
"""
# Context memory (KV cache) - always in VRAM for speed
context_memory = calculate_context_memory(context_size, quantization_bits)
# Model weights split between GPU and RAM
gpu_model_memory = base_vram_gb * (1 - offload_percent)
ram_model_memory = base_vram_gb * offload_percent
# Context cache stays in VRAM for performance
vram_total = gpu_model_memory + context_memory
ram_total = ram_model_memory
return vram_total, ram_total
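# Worked example (hypothetical 8 GB model, 32K context, 20% offload):
#   calculate_memory_with_offload(8.0, 32768, 0.2, 4)
#   context cache: ~8.0 GB (always kept in VRAM)
#   GPU weights:   8.0 * 0.8 = 6.4 GB -> vram_usage = 6.4 + 8.0 = 14.4 GB
#   RAM weights:   8.0 * 0.2 = 1.6 GB -> ram_usage  = 1.6 GB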
def get_available_memory_with_offload(
hardware: HardwareProfile,
offload_percent: float
) -> tuple[float, float]:
"""
Get available GPU VRAM and system RAM considering offloading.
Args:
hardware: Hardware profile
offload_percent: Offloading percentage
Returns:
(available_vram_gb, available_ram_gb)
"""
if hardware.gpu and not hardware.is_apple_silicon:
# External GPU - use GPU VRAM + potentially some system RAM
available_vram = hardware.gpu.vram_gb * 0.9 # 10% buffer
available_ram = hardware.ram_gb * 0.5 * offload_percent # Portion of RAM for offload
elif hardware.is_apple_silicon:
# Apple Silicon - unified memory
# Use full available memory (RAM - 4GB), not just 50%
available_total = hardware.available_memory_gb
available_vram = available_total * (1 - offload_percent)
available_ram = available_total * offload_percent
    else:
        # CPU only - no VRAM/RAM split, just system RAM
        # (available_memory_gb already applies the RAM - 4GB safety buffer)
        available_vram = hardware.available_memory_gb
available_ram = 0
return available_vram, available_ram
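# Example split (hypothetical Apple Silicon box with 28 GB available memory):
#   offload_percent=0.2 -> available_vram = 28 * 0.8 = 22.4 GB
#                          available_ram  = 28 * 0.2 = 5.6 GB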
def calculate_max_instances(available_memory_gb: float, memory_per_instance: float, optimal: bool = True) -> int:
"""
Calculate number of instances based on available memory.
Args:
available_memory_gb: Available memory in GB
memory_per_instance: Memory required per instance in GB
optimal: If True, cap at OPTIMAL_MAX_INSTANCES (3-5 sweet spot).
If False, return maximum possible (up to MAX_INSTANCES).
    Returns:
        Recommended number of instances (at most OPTIMAL_MAX_INSTANCES when
        optimal=True, else at most MAX_INSTANCES; always at least 1)
"""
effective_memory = available_memory_gb * MEMORY_OVERHEAD_FACTOR
max_possible = int(effective_memory // memory_per_instance)
if optimal:
# Use optimal range: 2-5 instances (research-backed sweet spot)
# 3-5 instances gives 85-90% of consensus benefit
# More than 5 has diminishing returns
if max_possible >= OPTIMAL_MAX_INSTANCES:
return OPTIMAL_MAX_INSTANCES # Cap at sweet spot
elif max_possible >= 3:
return max_possible # Use 3-4 if memory allows
        else:
            # Return what actually fits, but never report fewer than 1
            return max(max_possible, 1)
else:
# Return absolute maximum (for users who explicitly want more)
return max(MIN_INSTANCES, min(max_possible, MAX_INSTANCES))
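# Worked example: 24 GB available and 4.5 GB per instance:
#   effective = 24 * 0.95 = 22.8 GB -> int(22.8 // 4.5) = 5 instances fit,
#   which optimal=True caps exactly at OPTIMAL_MAX_INSTANCES (5).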
def select_optimal_model(
hardware: HardwareProfile,
preferred_model: Optional[str] = None,
force_instances: Optional[int] = None,
context_size: int = 32768,
offload_percent: float = 0.0,
    use_mlx: Optional[bool] = None
) -> Optional[ModelConfig]:
"""
Select the optimal model configuration for given hardware.
Args:
hardware: Hardware profile
preferred_model: Optional model ID to force (e.g., "qwen2.5-coder")
force_instances: Optional number of instances to force
context_size: Context window size in tokens (default: 32768)
offload_percent: Portion of model to offload to RAM (0.0-1.0)
        use_mlx: Whether to use MLX format models (None = auto-detect on Apple Silicon)
Returns:
ModelConfig or None if no suitable model found
"""
# Auto-detect MLX if on Apple Silicon and not explicitly set
    if use_mlx is None:
        use_mlx = hardware.is_apple_silicon
# Get available memory considering offloading
available_vram, available_ram = get_available_memory_with_offload(hardware, offload_percent)
# Get models to try (with appropriate quantizations)
# Note: Don't check available quantizations here (too slow for menu rendering)
# Only check when user is actually browsing or selecting custom config
if preferred_model:
from models.registry import get_model
preferred = get_model(preferred_model, use_mlx=use_mlx, check_available=False)
models = [preferred] if preferred else []
else:
models = list_models(use_mlx=use_mlx, check_available=False)
# Note: On Apple Silicon with MLX, multiple instances work fine in sequential mode
# The swarm manager will handle sequential execution to avoid GPU conflicts
# Try each model in priority order
for model in models:
config = _try_model_with_context(model, available_vram, force_instances, context_size, offload_percent, use_mlx)
if config:
return config
# If nothing fits, try smallest variant of first model
if models:
smallest_config = _try_smallest_variant_with_context(models[0], available_vram, force_instances, context_size, offload_percent, use_mlx)
if smallest_config:
return smallest_config
return None
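# Typical call site (sketch; assumes hardware.detector exposes a
# detect_hardware() helper, which is not shown in this file):
#   hw = detect_hardware()
#   config = select_optimal_model(hw, context_size=65536)
#   if config:
#       print(format_recommendation(config, hw))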
def _try_model_with_context(
model: Model,
available_vram: float,
force_instances: Optional[int],
context_size: int,
offload_percent: float,
use_mlx: bool = False
) -> Optional[ModelConfig]:
"""Try to fit a model in available memory with context and offloading."""
# Try variants from largest to smallest
for variant in sorted(model.variants, key=lambda v: v.base_vram_gb, reverse=True):
# Try quantizations from best to fastest
sorted_quants = sorted(
variant.quantizations,
key=lambda q: (['fast', 'good', 'better', 'best'].index(q.quality), -q.vram_gb),
reverse=True
)
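        # With reverse=True this visits 'best' quality first and, within a
        # quality tier, the lowest-VRAM quantization first (the negated
        # q.vram_gb is reversed too). Unknown quality labels raise ValueError.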
for quant in sorted_quants:
# Calculate memory with context and offloading
# Extract quantization bits from name (e.g., "4bit" -> 4, "q4_k_m" -> 4)
if 'bit' in quant.name:
# MLX format: "4bit", "3bit", etc.
quantization_bits = int(quant.name.replace('bit', ''))
elif 'q4' in quant.name:
quantization_bits = 4
elif 'q5' in quant.name:
quantization_bits = 5
elif 'q6' in quant.name:
quantization_bits = 6
else:
quantization_bits = 4 # Default fallback
vram_per_instance, ram_per_instance = calculate_memory_with_offload(
quant.vram_gb, context_size, offload_percent, quantization_bits
)
# Check if at least MIN_INSTANCES can fit
min_needed = vram_per_instance * MIN_INSTANCES
if min_needed > available_vram:
continue
# Calculate instances
if force_instances:
instances = force_instances
if not use_mlx: # On non-Mac, check if all instances fit in VRAM
total_needed = vram_per_instance * instances
if total_needed > available_vram:
continue
else:
# On Mac with MLX (use_mlx=True), use 3 responses by default
# On other platforms, calculate based on VRAM
if use_mlx:
instances = 3 # Default for seed variation mode
else:
instances = calculate_max_instances(available_vram, vram_per_instance)
# On Mac with seed variation, memory doesn't multiply
if use_mlx:
total_memory = vram_per_instance + ram_per_instance
else:
total_memory = (vram_per_instance + ram_per_instance) * instances
return ModelConfig(
model=model,
variant=variant,
quantization=quant,
instances=instances,
memory_per_instance_gb=vram_per_instance + ram_per_instance,
total_memory_gb=total_memory,
context_size=context_size,
offload_percent=offload_percent,
vram_usage_gb=vram_per_instance,
ram_usage_gb=ram_per_instance
)
return None
def _try_smallest_variant_with_context(
model: Model,
available_vram: float,
force_instances: Optional[int],
context_size: int,
offload_percent: float,
use_mlx: bool = False
) -> Optional[ModelConfig]:
"""Try to fit the smallest variant with the smallest quantization."""
if not model.variants:
return None
# Get smallest variant
smallest_variant = min(model.variants, key=lambda v: v.base_vram_gb)
# Get smallest quantization
if not smallest_variant.quantizations:
return None
smallest_quant = min(smallest_variant.quantizations, key=lambda q: q.vram_gb)
    # Calculate memory with context and offloading; parse quantization bits the
    # same way as _try_model_with_context (handles MLX "4bit"-style names too)
    if 'bit' in smallest_quant.name:
        quantization_bits = int(smallest_quant.name.replace('bit', ''))
    else:
        quantization_bits = 5 if 'q5' in smallest_quant.name else (6 if 'q6' in smallest_quant.name else 4)
vram_per_instance, ram_per_instance = calculate_memory_with_offload(
smallest_quant.vram_gb, context_size, offload_percent, quantization_bits
)
# Check if even this fits
if vram_per_instance > available_vram:
return None
# On Mac with MLX, use 3 responses by default
if use_mlx:
instances = force_instances or 3
else:
instances = force_instances or calculate_max_instances(available_vram, vram_per_instance)
instances = max(instances, 1)
# On Mac with seed variation, memory doesn't multiply
if use_mlx:
total_memory = vram_per_instance + ram_per_instance
else:
total_memory = (vram_per_instance + ram_per_instance) * instances
return ModelConfig(
model=model,
variant=smallest_variant,
quantization=smallest_quant,
instances=instances,
memory_per_instance_gb=vram_per_instance + ram_per_instance,
total_memory_gb=total_memory,
context_size=context_size,
offload_percent=offload_percent,
vram_usage_gb=vram_per_instance,
ram_usage_gb=ram_per_instance
)
def format_recommendation(config: ModelConfig, hardware: HardwareProfile) -> str:
"""Format a human-readable recommendation."""
lines = [
f"Model: {config.display_name}",
f"Context Window: {config.context_size//1000}K tokens",
f"Instances: {config.instances}",
f"GPU VRAM per instance: {config.vram_usage_gb:.1f} GB",
]
# Show RAM usage if offloading
if config.offload_percent > 0:
lines.append(f"System RAM per instance: {config.ram_usage_gb:.1f} GB")
lines.extend([
f"Total memory used: {config.total_memory_gb:.1f} GB",
f"Available memory: {hardware.available_memory_gb:.1f} GB",
])
# Add instance count explanation
    if config.instances == OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Using optimal instance count (3-5 = 85-90% consensus benefit)")
    elif config.instances == MIN_INSTANCES:
        lines.append("Note: Minimum instances for consensus voting")
    elif config.instances < OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Limited by available memory")
    else:
        lines.append("Note: Maximum instances (diminishing returns beyond 5)")
if hardware.gpu:
if hardware.is_apple_silicon:
lines.append(f"Hardware: Apple Silicon ({hardware.gpu.name})")
else:
lines.append(f"GPU: {hardware.gpu.name} ({hardware.gpu.vram_gb:.1f} GB)")
else:
lines.append("Hardware: CPU-only mode")
return "\n".join(lines)
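# Illustrative recommendation output (all values hypothetical):
#   Model: Qwen2.5 Coder 7B (q4_k_m, 32K ctx)
#   Context Window: 32K tokens
#   Instances: 4
#   GPU VRAM per instance: 4.5 GB
#   Total memory used: 18.0 GB
#   Available memory: 28.0 GB
#   Note: Limited by available memory
#   GPU: NVIDIA RTX 4090 (24.0 GB)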