local_swarm/src/models/selector.py
Latest commit dcca89d89a by sleepy: fix: OpenAI API compatibility for hollama and other clients
- Fixed ChatMessage.tool_calls to be Optional with default None (excluded when empty)
- Added logprobs field to ChatCompletionChoice (always included as null)
- Added stats and system_fingerprint to ChatCompletionResponse
- Fixed streaming response to use delta format (not message format)
- Fixed non-streaming response to include logprobs: null
- Updated tool instructions to include 'NO explanations'
- Added pytest-asyncio markers to async tests
- All 41 tests passing

This fixes the 'Cannot read properties of undefined (reading content)' error in hollama and ensures compatibility with OpenAI clients.
2026-02-25 19:39:05 +01:00

331 lines · 12 KiB · Python

"""Model selection logic for Local Swarm."""
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, List
from hardware.detector import HardwareProfile
from models.registry import Model, ModelVariant, QuantizationConfig, list_models
from models.memory_calculator import (
calculate_memory_with_offload,
get_available_memory_with_offload,
calculate_max_instances
)
@dataclass
class ModelConfig:
"""Configuration for running a model in the swarm."""
model: Model
variant: ModelVariant
quantization: QuantizationConfig
instances: int
memory_per_instance_gb: float
total_memory_gb: float
context_size: int = 32768
offload_percent: float = 0.0
vram_usage_gb: float = 0.0
ram_usage_gb: float = 0.0
def __post_init__(self):
"""Ensure default values are set if not provided."""
if not hasattr(self, 'context_size') or self.context_size is None:
object.__setattr__(self, 'context_size', 32768)
if not hasattr(self, 'offload_percent') or self.offload_percent is None:
object.__setattr__(self, 'offload_percent', 0.0)
def __repr__(self) -> str:
return (f"ModelConfig({self.model.name} {self.variant.size} "
f"{self.quantization.name}, {self.instances} instances, "
f"{self.total_memory_gb:.1f}GB total)")
@property
def model_id(self) -> str:
return f"{self.model.id}:{self.variant.size}:{self.quantization.name}"
@property
def display_name(self) -> str:
offload_str = f"+{int(self.offload_percent*100)}% offload" if self.offload_percent > 0 else ""
return f"{self.model.name} {self.variant.size} ({self.quantization.name}, {self.context_size//1000}K ctx{offload_str})"

# Load configuration from JSON
_config_path = Path(__file__).parent.parent.parent / "config" / "models" / "selector_config.json"
_config = {}
if _config_path.exists():
    with open(_config_path, 'r') as f:
        _config = json.load(f)

# Extract constraints
_constraints = _config.get("constraints", {})
MIN_INSTANCES = _constraints.get("min_instances", 1)
MAX_INSTANCES = _constraints.get("max_instances", 8)
OPTIMAL_MAX_INSTANCES = _constraints.get("optimal_max_instances", 5)
MEMORY_OVERHEAD_FACTOR = _constraints.get("memory_overhead_factor", 0.95)
MLX_MAX_INSTANCES = _constraints.get("mlx_max_instances", 1)

# Context and offload options
CONTEXT_OPTIONS = {int(k): v for k, v in _config.get("context_options", {}).items()}
OFFLOAD_OPTIONS = {float(k): v for k, v in _config.get("offload_options", {}).items()}
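
# The config file is optional: when selector_config.json is absent, the
# defaults above apply. A minimal file consistent with these lookups might look
# like the sketch below (illustrative only; the authoritative schema is
# whatever config/models/selector_config.json actually contains, and the
# option labels here are assumptions):
#   {
#     "constraints": {
#       "min_instances": 1,
#       "max_instances": 8,
#       "optimal_max_instances": 5,
#       "memory_overhead_factor": 0.95,
#       "mlx_max_instances": 1
#     },
#     "context_options": {"16384": "16K", "32768": "32K"},
#     "offload_options": {"0.0": "GPU only", "0.25": "25% to RAM"}
#   }
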
def select_optimal_model(
    hardware: HardwareProfile,
    preferred_model: Optional[str] = None,
    force_instances: Optional[int] = None,
    context_size: int = 32768,
    offload_percent: float = 0.0,
    use_mlx: Optional[bool] = None
) -> Optional[ModelConfig]:
    """Select the optimal model configuration for given hardware."""
    # Auto-detect MLX usage for Apple Silicon if not explicitly set
    if use_mlx is None:
        use_mlx = hardware.is_apple_silicon

    available_vram, _ = get_available_memory_with_offload(hardware, offload_percent)

    if preferred_model:
        config = _handle_preferred_model(
            preferred_model, hardware, available_vram, force_instances,
            context_size, offload_percent, use_mlx
        )
        if config:
            return config

    models = list_models(use_mlx=use_mlx)
    for model in models:
        config = _try_model_with_context(model, available_vram, force_instances, context_size, offload_percent, use_mlx)
        if config:
            return config

    if models:
        return _try_smallest_variant_with_context(models[0], available_vram, force_instances, context_size, offload_percent, use_mlx)

    return None
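

# Typical call, sketched with hedged assumptions: how the HardwareProfile is
# obtained depends on hardware.detector, and the detect() helper named below is
# hypothetical and only for illustration.
#
#   hw = detect()  # -> HardwareProfile
#   config = select_optimal_model(hw, context_size=32768)
#   if config:
#       print(format_recommendation(config, hw))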

def _handle_preferred_model(
    preferred_model: str,
    hardware: HardwareProfile,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool
) -> Optional[ModelConfig]:
    """Handle preferred model selection."""
    from models.registry import get_model

    model_id = preferred_model
    preferred_size = None
    preferred_quant = None
    if ':' in preferred_model:
        parts = preferred_model.split(':')
        if len(parts) >= 3:
            model_id = parts[0]
            preferred_size = parts[1]
            preferred_quant = parts[2]

    preferred = get_model(model_id, use_mlx=use_mlx)
    if not preferred:
        return None

    if preferred_size and preferred_quant:
        return _try_specific_config(
            preferred, preferred_size, preferred_quant, available_vram,
            force_instances, context_size, offload_percent, use_mlx
        )

    models = [preferred]
    for model in models:
        config = _try_model_with_context(model, available_vram, force_instances, context_size, offload_percent, use_mlx)
        if config:
            return config
    return None
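

# Accepted forms for preferred_model (the model ids below are illustrative only):
#   "qwen2.5-coder"            -> variant and quantization chosen automatically
#   "qwen2.5-coder:7B:q4_k_m"  -> that exact size and quantization is attempted
# A two-part form such as "model:size" is not split, so get_model() receives
# the full string including the colon.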

def _try_specific_config(
    model: Model,
    preferred_size: str,
    preferred_quant: str,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool
) -> Optional[ModelConfig]:
    """Try to use a specific model configuration."""
    for variant in model.variants:
        if variant.size.lower() == preferred_size.lower():
            for quant in variant.quantizations:
                if quant.name.lower() == preferred_quant.lower():
                    if quant.vram_gb <= available_vram:
                        instances = force_instances or calculate_max_instances(available_vram, quant.vram_gb, optimal=True)
                        if use_mlx:
                            instances = min(instances, MLX_MAX_INSTANCES)
                        return ModelConfig(
                            model=model,
                            variant=variant,
                            quantization=quant,
                            instances=instances,
                            memory_per_instance_gb=quant.vram_gb,
                            total_memory_gb=quant.vram_gb * instances,
                            context_size=context_size,
                            offload_percent=offload_percent,
                            vram_usage_gb=quant.vram_gb,
                            ram_usage_gb=0.0
                        )
                    else:
                        print(f"\n⚠️ Requested model requires {quant.vram_gb:.1f}GB but only {available_vram:.1f}GB available")
                        return None
    print(f"\n⚠️ Model configuration not found: {model.id}:{preferred_size}:{preferred_quant}")
    return None
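

# Note on fallback behaviour: when an exact size/quantization is requested and
# is not found or does not fit, this helper prints the warning and returns
# None, and select_optimal_model() then continues with its general search over
# list_models() rather than giving up.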

def _try_model_with_context(
    model: Model,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool = False
) -> Optional[ModelConfig]:
    """Try to fit a model in available memory with context and offloading."""
    for variant in sorted(model.variants, key=lambda v: v.base_vram_gb, reverse=True):
        sorted_quants = sorted(
            variant.quantizations,
            key=lambda q: (['fast', 'good', 'better', 'best'].index(q.quality), -q.vram_gb),
            reverse=True
        )
        for quant in sorted_quants:
            if 'bit' in quant.name:
                quantization_bits = int(quant.name.replace('bit', ''))
            elif 'q4' in quant.name:
                quantization_bits = 4
            elif 'q5' in quant.name:
                quantization_bits = 5
            elif 'q6' in quant.name:
                quantization_bits = 6
            else:
                quantization_bits = 4

            vram_per_instance, ram_per_instance = calculate_memory_with_offload(
                quant.vram_gb, context_size, offload_percent, quantization_bits
            )

            if vram_per_instance * MIN_INSTANCES > available_vram:
                continue

            if force_instances:
                instances = force_instances
                if not use_mlx and vram_per_instance * instances > available_vram:
                    continue
            else:
                instances = 1 if use_mlx else calculate_max_instances(available_vram, vram_per_instance)

            total_memory = vram_per_instance + ram_per_instance if use_mlx else (vram_per_instance + ram_per_instance) * instances

            return ModelConfig(
                model=model,
                variant=variant,
                quantization=quant,
                instances=instances,
                memory_per_instance_gb=vram_per_instance + ram_per_instance,
                total_memory_gb=total_memory,
                context_size=context_size,
                offload_percent=offload_percent,
                vram_usage_gb=vram_per_instance,
                ram_usage_gb=ram_per_instance
            )
    return None
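

# Worked illustration (hypothetical sizes, ignoring the context/offload terms
# added by calculate_memory_with_offload): with 24 GB of usable VRAM and a model
# offering a 14B variant at q6 (~13 GB) and q4 (~9 GB) plus a 7B variant at q4
# (~5 GB), the outer loop visits the 14B variant first, and the inner loop tries
# its highest-quality quant first (assuming q6 carries the higher quality
# label), so 14B/q6 with a single instance is returned even though 7B/q4 could
# have run several instances in parallel.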

def _try_smallest_variant_with_context(
    model: Model,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool = False
) -> Optional[ModelConfig]:
    """Try to fit the smallest variant with the smallest quantization."""
    if not model.variants:
        return None
    smallest_variant = min(model.variants, key=lambda v: v.base_vram_gb)
    if not smallest_variant.quantizations:
        return None
    smallest_quant = min(smallest_variant.quantizations, key=lambda q: q.vram_gb)

    # Mirror the quantization-bit parsing in _try_model_with_context so
    # 'Nbit'-style names are handled as well as 'qN'-style names.
    if 'bit' in smallest_quant.name:
        quantization_bits = int(smallest_quant.name.replace('bit', ''))
    elif 'q4' in smallest_quant.name:
        quantization_bits = 4
    elif 'q5' in smallest_quant.name:
        quantization_bits = 5
    else:
        quantization_bits = 6

    vram_per_instance, ram_per_instance = calculate_memory_with_offload(
        smallest_quant.vram_gb, context_size, offload_percent, quantization_bits
    )
    if vram_per_instance > available_vram:
        return None

    instances = force_instances or (1 if use_mlx else calculate_max_instances(available_vram, vram_per_instance))
    instances = max(instances, 1)
    total_memory = vram_per_instance + ram_per_instance if use_mlx else (vram_per_instance + ram_per_instance) * instances

    return ModelConfig(
        model=model,
        variant=smallest_variant,
        quantization=smallest_quant,
        instances=instances,
        memory_per_instance_gb=vram_per_instance + ram_per_instance,
        total_memory_gb=total_memory,
        context_size=context_size,
        offload_percent=offload_percent,
        vram_usage_gb=vram_per_instance,
        ram_usage_gb=ram_per_instance
    )

def format_recommendation(config: ModelConfig, hardware: HardwareProfile) -> str:
    """Format a human-readable recommendation."""
    lines = [
        f"Model: {config.display_name}",
        f"Context Window: {config.context_size//1000}K tokens",
        f"Instances: {config.instances}",
        f"GPU VRAM per instance: {config.vram_usage_gb:.1f} GB",
    ]
    if config.offload_percent > 0:
        lines.append(f"System RAM per instance: {config.ram_usage_gb:.1f} GB")
    lines.extend([
        f"Total memory used: {config.total_memory_gb:.1f} GB",
        f"Available memory: {hardware.available_memory_gb:.1f} GB",
    ])

    if config.instances == OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Using optimal instance count (3-5 = 85-90% consensus benefit)")
    elif config.instances == MIN_INSTANCES:
        lines.append("Note: Minimum instances for consensus voting")
    elif config.instances < OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Limited by available memory")
    else:
        lines.append("Note: Maximum instances (diminishing returns beyond 5)")

    if hardware.gpu:
        if hardware.is_apple_silicon:
            lines.append(f"Hardware: Apple Silicon ({hardware.gpu.name})")
        else:
            lines.append(f"GPU: {hardware.gpu.name} ({hardware.gpu.vram_gb:.1f} GB)")
    else:
        lines.append("Hardware: CPU-only mode")

    return "\n".join(lines)