2461f45ca8
- select_optimal_model was checking HF API for available quantizations
- This caused menu to hang/slow down when changing context
- Now only checks availability when browsing or custom config
- Recommended config uses default quantizations (faster)
419 lines · 16 KiB · Python
"""Model selection logic for Local Swarm."""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import Optional, List
|
|
from hardware.detector import HardwareProfile
|
|
from models.registry import Model, ModelVariant, QuantizationConfig, list_models
|
|
|
|
|
|
@dataclass
class ModelConfig:
    """Configuration for running a model in the swarm."""
    model: Model
    variant: ModelVariant
    quantization: QuantizationConfig
    instances: int
    memory_per_instance_gb: float
    total_memory_gb: float
    context_size: int = 32768      # Context window in tokens (16K, 32K, 64K, 128K)
    offload_percent: float = 0.0   # Fraction of model weights offloaded to RAM (0.0, 0.2, 0.5)
    vram_usage_gb: float = 0.0     # Actual VRAM usage per instance
    ram_usage_gb: float = 0.0      # System RAM usage per instance (when offloading)

    def __post_init__(self):
        """Ensure default values are set if None was passed explicitly."""
        # Dataclass fields always exist after __init__, so no hasattr check is
        # needed; the class is not frozen, so plain assignment works.
        if self.context_size is None:
            self.context_size = 32768
        if self.offload_percent is None:
            self.offload_percent = 0.0

    def __repr__(self) -> str:
        return (f"ModelConfig({self.model.name} {self.variant.size} "
                f"{self.quantization.name}, {self.instances} instances, "
                f"{self.total_memory_gb:.1f}GB total)")

    @property
    def model_id(self) -> str:
        return f"{self.model.id}:{self.variant.size}:{self.quantization.name}"

    @property
    def display_name(self) -> str:
        # Divide by 1024, not 1000: 65536 // 1024 == 64 and 131072 // 1024 == 128,
        # whereas // 1000 would mislabel the larger windows as 65K/131K.
        offload_str = f", +{int(self.offload_percent * 100)}% offload" if self.offload_percent > 0 else ""
        return f"{self.model.name} {self.variant.size} ({self.quantization.name}, {self.context_size // 1024}K ctx{offload_str})"

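
# Examples of the derived strings above (field values are illustrative, not
# taken from the registry):
#
#   >>> cfg.model_id       # "qwen2.5-coder:7B:q4_k_m"
#   >>> cfg.display_name   # "Qwen2.5 Coder 7B (q4_k_m, 32K ctx, +20% offload)"
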
# Configuration constraints
MIN_INSTANCES = 1               # Allow 1 instance (needed for Apple Silicon MLX)
MAX_INSTANCES = 8
OPTIMAL_MAX_INSTANCES = 5       # Sweet spot for consensus (85-90% of the benefit)
MEMORY_OVERHEAD_FACTOR = 0.95   # Leave a 5% buffer

# Apple Silicon MLX constraints - MLX uses the GPU efficiently with one worker
MLX_MAX_INSTANCES = 1           # MLX handles all GPU resources in a single instance

# Context window options
CONTEXT_OPTIONS = {
    16384: "16K tokens",
    32768: "32K tokens (default)",
    65536: "64K tokens",
    131072: "128K tokens",
}

# Offloading options
OFFLOAD_OPTIONS = {
    0.0: "No offload (default) - 100% GPU",
    0.2: "20% offload - 80% GPU, 20% RAM",
    0.5: "50% offload - 50% GPU, 50% RAM",
}

def calculate_context_memory(context_size: int, quantization_bits: int = 4) -> float:
    """
    Calculate additional memory needed for the KV cache based on context size.

    Args:
        context_size: Number of tokens in context window
        quantization_bits: Quantization bits (4 for Q4, 5 for Q5, etc.)

    Returns:
        Additional VRAM needed in GB
    """
    # KV cache memory per token: 2 * num_layers * hidden_dim * bytes_per_param.
    # Rough heuristic: ~0.5 MB per token at fp16, scaled by quantization bits.
    mb_per_token = (quantization_bits / 8) * 0.5  # e.g. 0.25 MB/token at 4-bit
    memory_mb = context_size * mb_per_token
    return memory_mb / 1024  # Convert MB to GB

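
# Sanity check of the heuristic above (doctest-style; the numbers follow
# directly from the formula, not from measurement):
#
#   >>> round(calculate_context_memory(32768, quantization_bits=4), 1)
#   8.0
#   >>> round(calculate_context_memory(16384, quantization_bits=5), 1)
#   5.0
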
def calculate_memory_with_offload(
    base_vram_gb: float,
    context_size: int,
    offload_percent: float,
    quantization_bits: int = 4
) -> tuple[float, float]:
    """
    Calculate VRAM and RAM usage with offloading.

    Args:
        base_vram_gb: Base model VRAM without context
        context_size: Context window size in tokens
        offload_percent: Fraction of model weights offloaded to RAM (0.0-1.0)
        quantization_bits: Quantization precision

    Returns:
        (vram_usage_gb, ram_usage_gb)
    """
    # Context memory (KV cache) - always kept in VRAM for speed
    context_memory = calculate_context_memory(context_size, quantization_bits)

    # Model weights split between GPU and RAM
    gpu_model_memory = base_vram_gb * (1 - offload_percent)
    ram_model_memory = base_vram_gb * offload_percent

    vram_total = gpu_model_memory + context_memory
    ram_total = ram_model_memory

    return vram_total, ram_total

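
# Worked example (hypothetical numbers): 4.0 GB of weights, a 32K context at
# 4-bit, and 20% of the weights pushed to system RAM:
#
#   >>> vram, ram = calculate_memory_with_offload(4.0, 32768, 0.2)
#   >>> round(vram, 1), round(ram, 1)  # 3.2 GB weights + 8.0 GB KV cache; 0.8 GB RAM
#   (11.2, 0.8)
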
def get_available_memory_with_offload(
    hardware: HardwareProfile,
    offload_percent: float
) -> tuple[float, float]:
    """
    Get available GPU VRAM and system RAM considering offloading.

    Args:
        hardware: Hardware profile
        offload_percent: Offloading fraction (0.0-1.0)

    Returns:
        (available_vram_gb, available_ram_gb)
    """
    if hardware.gpu and not hardware.is_apple_silicon:
        # Discrete GPU - use GPU VRAM plus, when offloading, part of system RAM
        available_vram = hardware.gpu.vram_gb * 0.9  # 10% buffer
        available_ram = hardware.ram_gb * 0.5 * offload_percent  # Portion of RAM for offload
    elif hardware.is_apple_silicon:
        # Apple Silicon - unified memory; use the full available pool
        # (RAM - 4GB), not just 50%, split by the offload fraction
        available_total = hardware.available_memory_gb
        available_vram = available_total * (1 - offload_percent)
        available_ram = available_total * offload_percent
    else:
        # CPU only - there is no VRAM/RAM split, just system RAM
        # (available_memory_gb already applies the RAM - 4GB safety margin)
        available_vram = hardware.available_memory_gb
        available_ram = 0

    return available_vram, available_ram

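
# Illustration of the split above (hypothetical profile): with 24 GB of
# available unified memory and offload_percent=0.2, Apple Silicon gets a
# (1 - 0.2) * 24 = 19.2 GB "VRAM" budget and a 0.2 * 24 = 4.8 GB "RAM" budget;
# a discrete 16 GB card always gets 0.9 * 16 = 14.4 GB of VRAM, with system
# RAM added to the budget only when offloading.
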
def calculate_max_instances(available_memory_gb: float, memory_per_instance: float, optimal: bool = True) -> int:
    """
    Calculate number of instances based on available memory.

    Args:
        available_memory_gb: Available memory in GB
        memory_per_instance: Memory required per instance in GB
        optimal: If True, cap at OPTIMAL_MAX_INSTANCES (3-5 sweet spot).
            If False, return maximum possible (up to MAX_INSTANCES).

    Returns:
        Recommended number of instances (1-5 for optimal, 1-8 for max)
    """
    effective_memory = available_memory_gb * MEMORY_OVERHEAD_FACTOR
    max_possible = int(effective_memory // memory_per_instance)

    if optimal:
        # Optimal range: 3-5 instances (research-backed sweet spot) gives
        # 85-90% of the consensus benefit; more than 5 has diminishing returns
        if max_possible >= OPTIMAL_MAX_INSTANCES:
            return OPTIMAL_MAX_INSTANCES  # Cap at the sweet spot
        elif max_possible >= 3:
            return max_possible  # Use 3-4 if memory allows
        else:
            # Return what actually fits, never less than 1
            return max(max_possible, 1)
    else:
        # Return the absolute maximum (for users who explicitly want more)
        return max(MIN_INSTANCES, min(max_possible, MAX_INSTANCES))

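
# Capping behavior of calculate_max_instances (pure arithmetic, no hardware
# needed): 40 GB * 0.95 buffer fits 7 instances of 5 GB each.
#
#   >>> calculate_max_instances(40.0, 5.0)                 # 7 fit -> capped at 5
#   5
#   >>> calculate_max_instances(40.0, 5.0, optimal=False)  # absolute maximum
#   7
#   >>> calculate_max_instances(8.0, 5.0)                  # only 1 fits
#   1
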
def select_optimal_model(
    hardware: HardwareProfile,
    preferred_model: Optional[str] = None,
    force_instances: Optional[int] = None,
    context_size: int = 32768,
    offload_percent: float = 0.0,
    use_mlx: Optional[bool] = None
) -> Optional[ModelConfig]:
    """
    Select the optimal model configuration for the given hardware.

    Args:
        hardware: Hardware profile
        preferred_model: Optional model ID to force (e.g., "qwen2.5-coder")
        force_instances: Optional number of instances to force
        context_size: Context window size in tokens (default: 32768)
        offload_percent: Fraction of model to offload to RAM (0.0-1.0)
        use_mlx: Whether to use MLX format models; None auto-detects
            (True on Apple Silicon, False elsewhere)

    Returns:
        ModelConfig or None if no suitable model found
    """
    # Auto-detect MLX if not explicitly set
    if use_mlx is None:
        use_mlx = hardware.is_apple_silicon

    # Get available memory considering offloading
    available_vram, available_ram = get_available_memory_with_offload(hardware, offload_percent)

    # Get models to try (with appropriate quantizations).
    # Don't check available quantizations here - querying the HF API is too
    # slow for menu rendering; only check when the user is actually browsing
    # or building a custom config.
    if preferred_model:
        from models.registry import get_model
        preferred = get_model(preferred_model, use_mlx=use_mlx, check_available=False)
        models = [preferred] if preferred else []
    else:
        models = list_models(use_mlx=use_mlx, check_available=False)

    # On Apple Silicon with MLX, multiple instances work fine in sequential
    # mode; the swarm manager handles sequential execution to avoid GPU conflicts.

    # Try each model in priority order
    for model in models:
        config = _try_model_with_context(model, available_vram, force_instances,
                                         context_size, offload_percent, use_mlx)
        if config:
            return config

    # If nothing fits, try the smallest variant of the first model
    if models:
        smallest_config = _try_smallest_variant_with_context(models[0], available_vram, force_instances,
                                                             context_size, offload_percent, use_mlx)
        if smallest_config:
            return smallest_config

    return None

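
# Typical call (sketch; how a HardwareProfile is obtained is outside this
# module - detect_hardware() below is a hypothetical entry point, not an API
# defined here):
#
#   profile = detect_hardware()
#   config = select_optimal_model(profile, context_size=65536)
#   if config:
#       print(format_recommendation(config, profile))
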
def _try_model_with_context(
    model: Model,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool = False
) -> Optional[ModelConfig]:
    """Try to fit a model in available memory with context and offloading."""
    # Try variants from largest to smallest
    for variant in sorted(model.variants, key=lambda v: v.base_vram_gb, reverse=True):
        # Try quantizations from best quality to fastest; within a quality
        # tier, -q.vram_gb under reverse=True puts the smaller-VRAM option first
        sorted_quants = sorted(
            variant.quantizations,
            key=lambda q: (['fast', 'good', 'better', 'best'].index(q.quality), -q.vram_gb),
            reverse=True
        )

        for quant in sorted_quants:
            # Calculate memory with context and offloading.
            # Extract quantization bits from the name (e.g., "4bit" -> 4, "q4_k_m" -> 4)
            if 'bit' in quant.name:
                # MLX format: "4bit", "3bit", etc.
                quantization_bits = int(quant.name.replace('bit', ''))
            elif 'q4' in quant.name:
                quantization_bits = 4
            elif 'q5' in quant.name:
                quantization_bits = 5
            elif 'q6' in quant.name:
                quantization_bits = 6
            else:
                quantization_bits = 4  # Default fallback

            vram_per_instance, ram_per_instance = calculate_memory_with_offload(
                quant.vram_gb, context_size, offload_percent, quantization_bits
            )

            # Check if at least MIN_INSTANCES can fit
            min_needed = vram_per_instance * MIN_INSTANCES
            if min_needed > available_vram:
                continue

            # Calculate instances
            if force_instances:
                instances = force_instances
                if not use_mlx:  # On non-Mac, check that all instances fit in VRAM
                    total_needed = vram_per_instance * instances
                    if total_needed > available_vram:
                        continue
            else:
                # On Mac with MLX (use_mlx=True), default to 3 responses;
                # on other platforms, calculate from available VRAM
                if use_mlx:
                    instances = 3  # Default for seed variation mode
                else:
                    instances = calculate_max_instances(available_vram, vram_per_instance)

            # On Mac with seed variation, memory doesn't multiply per instance
            if use_mlx:
                total_memory = vram_per_instance + ram_per_instance
            else:
                total_memory = (vram_per_instance + ram_per_instance) * instances

            return ModelConfig(
                model=model,
                variant=variant,
                quantization=quant,
                instances=instances,
                memory_per_instance_gb=vram_per_instance + ram_per_instance,
                total_memory_gb=total_memory,
                context_size=context_size,
                offload_percent=offload_percent,
                vram_usage_gb=vram_per_instance,
                ram_usage_gb=ram_per_instance,
            )

    return None

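
# Name-parsing behavior of the bit-width heuristic in _try_model_with_context
# (names are examples of the two registry styles handled above):
#   "4bit"   -> 4 (MLX style)      "q4_k_m" -> 4
#   "q5_k_s" -> 5                  "q6_k"   -> 6
#   "q8_0"   -> 4 (no matching branch, falls through to the default)
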
def _try_smallest_variant_with_context(
    model: Model,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool = False
) -> Optional[ModelConfig]:
    """Try to fit the smallest variant with the smallest quantization."""
    if not model.variants:
        return None

    # Get smallest variant
    smallest_variant = min(model.variants, key=lambda v: v.base_vram_gb)

    # Get smallest quantization
    if not smallest_variant.quantizations:
        return None

    smallest_quant = min(smallest_variant.quantizations, key=lambda q: q.vram_gb)

    # Calculate memory with context and offloading; mirror the bit-width
    # extraction used in _try_model_with_context (including MLX "Nbit" names)
    if 'bit' in smallest_quant.name:
        quantization_bits = int(smallest_quant.name.replace('bit', ''))
    elif 'q5' in smallest_quant.name:
        quantization_bits = 5
    elif 'q6' in smallest_quant.name:
        quantization_bits = 6
    else:
        quantization_bits = 4  # Default fallback (covers q4 variants)

    vram_per_instance, ram_per_instance = calculate_memory_with_offload(
        smallest_quant.vram_gb, context_size, offload_percent, quantization_bits
    )

    # Check if even this fits
    if vram_per_instance > available_vram:
        return None

    # On Mac with MLX, default to 3 responses
    if use_mlx:
        instances = force_instances or 3
    else:
        instances = force_instances or calculate_max_instances(available_vram, vram_per_instance)
        instances = max(instances, 1)

    # On Mac with seed variation, memory doesn't multiply per instance
    if use_mlx:
        total_memory = vram_per_instance + ram_per_instance
    else:
        total_memory = (vram_per_instance + ram_per_instance) * instances

    return ModelConfig(
        model=model,
        variant=smallest_variant,
        quantization=smallest_quant,
        instances=instances,
        memory_per_instance_gb=vram_per_instance + ram_per_instance,
        total_memory_gb=total_memory,
        context_size=context_size,
        offload_percent=offload_percent,
        vram_usage_gb=vram_per_instance,
        ram_usage_gb=ram_per_instance,
    )

def format_recommendation(config: ModelConfig, hardware: HardwareProfile) -> str:
    """Format a human-readable recommendation."""
    lines = [
        f"Model: {config.display_name}",
        f"Context Window: {config.context_size // 1024}K tokens",
        f"Instances: {config.instances}",
        f"GPU VRAM per instance: {config.vram_usage_gb:.1f} GB",
    ]

    # Show RAM usage if offloading
    if config.offload_percent > 0:
        lines.append(f"System RAM per instance: {config.ram_usage_gb:.1f} GB")

    lines.extend([
        f"Total memory used: {config.total_memory_gb:.1f} GB",
        f"Available memory: {hardware.available_memory_gb:.1f} GB",
    ])

    # Add instance count explanation
    if config.instances == OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Using optimal instance count (3-5 = 85-90% consensus benefit)")
    elif config.instances == MIN_INSTANCES:
        lines.append("Note: Minimum instances for consensus voting")
    elif config.instances < OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Limited by available memory")
    else:
        lines.append("Note: Maximum instances (diminishing returns beyond 5)")

    if hardware.gpu:
        if hardware.is_apple_silicon:
            lines.append(f"Hardware: Apple Silicon ({hardware.gpu.name})")
        else:
            lines.append(f"GPU: {hardware.gpu.name} ({hardware.gpu.vram_gb:.1f} GB)")
    else:
        lines.append("Hardware: CPU-only mode")

    return "\n".join(lines)
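
# Sample output (values are illustrative only; a 32 GB discrete card leaves
# 0.9 * 32 = 28.8 GB available, which fits three 8.5 GB instances):
#
#   Model: Qwen2.5 Coder 7B (q4_k_m, 16K ctx)
#   Context Window: 16K tokens
#   Instances: 3
#   GPU VRAM per instance: 8.5 GB
#   Total memory used: 25.5 GB
#   Available memory: 28.8 GB
#   Note: Limited by available memory
#   GPU: NVIDIA RTX 5090 (32.0 GB)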