dcca89d89a
- Fixed ChatMessage.tool_calls to be Optional with default None (excluded when empty)
- Added logprobs field to ChatCompletionChoice (always included as null)
- Added stats and system_fingerprint to ChatCompletionResponse
- Fixed streaming response to use delta format (not message format)
- Fixed non-streaming response to include logprobs: null
- Updated tool instructions to include 'NO explanations'
- Added pytest-asyncio markers to async tests
- All 41 tests passing

This fixes the 'Cannot read properties of undefined (reading content)' error in hollama and ensures compatibility with OpenAI clients.
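For reference, a minimal sketch of the two response shapes the commit message distinguishes, assuming standard OpenAI-style chat completion payloads (only the fields named above are taken from the commit; the surrounding keys and values are illustrative):

# Non-streaming: each choice carries a full "message" plus an explicit logprobs: null.
non_streaming_choice = {
    "index": 0,
    "message": {"role": "assistant", "content": "Hello!"},  # tool_calls omitted when empty
    "logprobs": None,
    "finish_reason": "stop",
}

# Streaming: each chunk's choice carries a partial "delta" instead of "message",
# which is what clients such as hollama read while the response streams.
streaming_choice = {
    "index": 0,
    "delta": {"role": "assistant", "content": "Hel"},
    "finish_reason": None,
}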
331 lines · 12 KiB · Python
"""Model selection logic for Local Swarm."""

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, List

from hardware.detector import HardwareProfile
from models.registry import Model, ModelVariant, QuantizationConfig, list_models
from models.memory_calculator import (
    calculate_memory_with_offload,
    get_available_memory_with_offload,
    calculate_max_instances
)


@dataclass
class ModelConfig:
    """Configuration for running a model in the swarm."""
    model: Model
    variant: ModelVariant
    quantization: QuantizationConfig
    instances: int
    memory_per_instance_gb: float
    total_memory_gb: float
    context_size: int = 32768
    offload_percent: float = 0.0
    vram_usage_gb: float = 0.0
    ram_usage_gb: float = 0.0

    def __post_init__(self):
        """Ensure default values are set if not provided."""
        if not hasattr(self, 'context_size') or self.context_size is None:
            object.__setattr__(self, 'context_size', 32768)
        if not hasattr(self, 'offload_percent') or self.offload_percent is None:
            object.__setattr__(self, 'offload_percent', 0.0)

    def __repr__(self) -> str:
        return (f"ModelConfig({self.model.name} {self.variant.size} "
                f"{self.quantization.name}, {self.instances} instances, "
                f"{self.total_memory_gb:.1f}GB total)")

    @property
    def model_id(self) -> str:
        return f"{self.model.id}:{self.variant.size}:{self.quantization.name}"

    @property
    def display_name(self) -> str:
        offload_str = f"+{int(self.offload_percent*100)}% offload" if self.offload_percent > 0 else ""
        return f"{self.model.name} {self.variant.size} ({self.quantization.name}, {self.context_size//1000}K ctx{offload_str})"
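
# Illustrative example (all values and names are made up, not read from the registry):
#
#     cfg = ModelConfig(model=llama3, variant=llama3_8b, quantization=q4_k_m,
#                       instances=3, memory_per_instance_gb=5.5, total_memory_gb=16.5)
#     cfg.display_name  # -> "Llama 3 8B (Q4_K_M, 32K ctx)"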


# Load configuration from JSON
_config_path = Path(__file__).parent.parent.parent / "config" / "models" / "selector_config.json"
_config = {}
if _config_path.exists():
    with open(_config_path, 'r') as f:
        _config = json.load(f)

# Extract constraints
_constraints = _config.get("constraints", {})
MIN_INSTANCES = _constraints.get("min_instances", 1)
MAX_INSTANCES = _constraints.get("max_instances", 8)
OPTIMAL_MAX_INSTANCES = _constraints.get("optimal_max_instances", 5)
MEMORY_OVERHEAD_FACTOR = _constraints.get("memory_overhead_factor", 0.95)
MLX_MAX_INSTANCES = _constraints.get("mlx_max_instances", 1)

# Context and offload options
CONTEXT_OPTIONS = {int(k): v for k, v in _config.get("context_options", {}).items()}
OFFLOAD_OPTIONS = {float(k): v for k, v in _config.get("offload_options", {}).items()}
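
# For illustration, selector_config.json is assumed to look roughly like this
# (the keys mirror the lookups above; the option values are examples, not the shipped defaults):
#
#     {
#         "constraints": {"min_instances": 1, "max_instances": 8,
#                         "optimal_max_instances": 5, "memory_overhead_factor": 0.95,
#                         "mlx_max_instances": 1},
#         "context_options": {"8192": "8K", "32768": "32K"},
#         "offload_options": {"0.0": "none", "0.5": "half"}
#     }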


def select_optimal_model(
    hardware: HardwareProfile,
    preferred_model: Optional[str] = None,
    force_instances: Optional[int] = None,
    context_size: int = 32768,
    offload_percent: float = 0.0,
    use_mlx: Optional[bool] = None
) -> Optional[ModelConfig]:
    """Select the optimal model configuration for given hardware."""
    # Auto-detect MLX usage for Apple Silicon if not explicitly set
    if use_mlx is None:
        use_mlx = hardware.is_apple_silicon

    available_vram, _ = get_available_memory_with_offload(hardware, offload_percent)

    if preferred_model:
        config = _handle_preferred_model(
            preferred_model, hardware, available_vram, force_instances,
            context_size, offload_percent, use_mlx
        )
        if config:
            return config

    models = list_models(use_mlx=use_mlx)

    for model in models:
        config = _try_model_with_context(model, available_vram, force_instances, context_size, offload_percent, use_mlx)
        if config:
            return config

    if models:
        return _try_smallest_variant_with_context(models[0], available_vram, force_instances, context_size, offload_percent, use_mlx)

    return None
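
# Illustrative usage (detect_hardware() and the model id below are assumptions for the
# example, not guaranteed APIs of this project):
#
#     from hardware.detector import detect_hardware
#
#     hw = detect_hardware()
#     cfg = select_optimal_model(hw, preferred_model="llama3:8b:q4_k_m", context_size=16384)
#     if cfg:
#         print(format_recommendation(cfg, hw))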


def _handle_preferred_model(
    preferred_model: str,
    hardware: HardwareProfile,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool
) -> Optional[ModelConfig]:
    """Handle preferred model selection."""
    from models.registry import get_model

    model_id = preferred_model
    preferred_size = None
    preferred_quant = None

    if ':' in preferred_model:
        parts = preferred_model.split(':')
        if len(parts) >= 3:
            model_id = parts[0]
            preferred_size = parts[1]
            preferred_quant = parts[2]

    preferred = get_model(model_id, use_mlx=use_mlx)
    if not preferred:
        return None

    if preferred_size and preferred_quant:
        return _try_specific_config(
            preferred, preferred_size, preferred_quant, available_vram,
            force_instances, context_size, offload_percent, use_mlx
        )

    models = [preferred]
    for model in models:
        config = _try_model_with_context(model, available_vram, force_instances, context_size, offload_percent, use_mlx)
        if config:
            return config

    return None


def _try_specific_config(
    model: Model,
    preferred_size: str,
    preferred_quant: str,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool
) -> Optional[ModelConfig]:
    """Try to use a specific model configuration."""
    for variant in model.variants:
        if variant.size.lower() == preferred_size.lower():
            for quant in variant.quantizations:
                if quant.name.lower() == preferred_quant.lower():
                    if quant.vram_gb <= available_vram:
                        instances = force_instances or calculate_max_instances(available_vram, quant.vram_gb, optimal=True)
                        if use_mlx:
                            instances = min(instances, MLX_MAX_INSTANCES)

                        return ModelConfig(
                            model=model,
                            variant=variant,
                            quantization=quant,
                            instances=instances,
                            memory_per_instance_gb=quant.vram_gb,
                            total_memory_gb=quant.vram_gb * instances,
                            context_size=context_size,
                            offload_percent=offload_percent,
                            vram_usage_gb=quant.vram_gb,
                            ram_usage_gb=0.0
                        )
                    else:
                        print(f"\n⚠️ Requested model requires {quant.vram_gb:.1f}GB but only {available_vram:.1f}GB available")
                        return None

    print(f"\n⚠️ Model configuration not found: {model.id}:{preferred_size}:{preferred_quant}")
    return None


def _try_model_with_context(
    model: Model,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool = False
) -> Optional[ModelConfig]:
    """Try to fit a model in available memory with context and offloading."""
    for variant in sorted(model.variants, key=lambda v: v.base_vram_gb, reverse=True):
        sorted_quants = sorted(
            variant.quantizations,
            key=lambda q: (['fast', 'good', 'better', 'best'].index(q.quality), -q.vram_gb),
            reverse=True
        )

        for quant in sorted_quants:
            if 'bit' in quant.name:
                quantization_bits = int(quant.name.replace('bit', ''))
            elif 'q4' in quant.name:
                quantization_bits = 4
            elif 'q5' in quant.name:
                quantization_bits = 5
            elif 'q6' in quant.name:
                quantization_bits = 6
            else:
                quantization_bits = 4

            vram_per_instance, ram_per_instance = calculate_memory_with_offload(
                quant.vram_gb, context_size, offload_percent, quantization_bits
            )

            if vram_per_instance * MIN_INSTANCES > available_vram:
                continue

            if force_instances:
                instances = force_instances
                if not use_mlx and vram_per_instance * instances > available_vram:
                    continue
            else:
                instances = 1 if use_mlx else calculate_max_instances(available_vram, vram_per_instance)

            total_memory = vram_per_instance + ram_per_instance if use_mlx else (vram_per_instance + ram_per_instance) * instances

            return ModelConfig(
                model=model,
                variant=variant,
                quantization=quant,
                instances=instances,
                memory_per_instance_gb=vram_per_instance + ram_per_instance,
                total_memory_gb=total_memory,
                context_size=context_size,
                offload_percent=offload_percent,
                vram_usage_gb=vram_per_instance,
                ram_usage_gb=ram_per_instance
            )

    return None
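
# Worked examples of the quantization-bit inference above (quant names are illustrative):
# "4bit" -> 4, "8bit" -> 8, "q4_k_m" -> 4, "q5_k_s" -> 5, "q6_k" -> 6,
# anything else falls back to 4 bits.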


def _try_smallest_variant_with_context(
    model: Model,
    available_vram: float,
    force_instances: Optional[int],
    context_size: int,
    offload_percent: float,
    use_mlx: bool = False
) -> Optional[ModelConfig]:
    """Try to fit the smallest variant with the smallest quantization."""
    if not model.variants:
        return None

    smallest_variant = min(model.variants, key=lambda v: v.base_vram_gb)
    if not smallest_variant.quantizations:
        return None

    smallest_quant = min(smallest_variant.quantizations, key=lambda q: q.vram_gb)

    quantization_bits = 4 if 'q4' in smallest_quant.name else (5 if 'q5' in smallest_quant.name else 6)
    vram_per_instance, ram_per_instance = calculate_memory_with_offload(
        smallest_quant.vram_gb, context_size, offload_percent, quantization_bits
    )

    if vram_per_instance > available_vram:
        return None

    instances = force_instances or (1 if use_mlx else calculate_max_instances(available_vram, vram_per_instance))
    instances = max(instances, 1)

    total_memory = vram_per_instance + ram_per_instance if use_mlx else (vram_per_instance + ram_per_instance) * instances

    return ModelConfig(
        model=model,
        variant=smallest_variant,
        quantization=smallest_quant,
        instances=instances,
        memory_per_instance_gb=vram_per_instance + ram_per_instance,
        total_memory_gb=total_memory,
        context_size=context_size,
        offload_percent=offload_percent,
        vram_usage_gb=vram_per_instance,
        ram_usage_gb=ram_per_instance
    )


def format_recommendation(config: ModelConfig, hardware: HardwareProfile) -> str:
    """Format a human-readable recommendation."""
    lines = [
        f"Model: {config.display_name}",
        f"Context Window: {config.context_size//1000}K tokens",
        f"Instances: {config.instances}",
        f"GPU VRAM per instance: {config.vram_usage_gb:.1f} GB",
    ]

    if config.offload_percent > 0:
        lines.append(f"System RAM per instance: {config.ram_usage_gb:.1f} GB")

    lines.extend([
        f"Total memory used: {config.total_memory_gb:.1f} GB",
        f"Available memory: {hardware.available_memory_gb:.1f} GB",
    ])

    if config.instances == OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Using optimal instance count (3-5 = 85-90% consensus benefit)")
    elif config.instances == MIN_INSTANCES:
        lines.append("Note: Minimum instances for consensus voting")
    elif config.instances < OPTIMAL_MAX_INSTANCES:
        lines.append("Note: Limited by available memory")
    else:
        lines.append("Note: Maximum instances (diminishing returns beyond 5)")

    if hardware.gpu:
        if hardware.is_apple_silicon:
            lines.append(f"Hardware: Apple Silicon ({hardware.gpu.name})")
        else:
            lines.append(f"GPU: {hardware.gpu.name} ({hardware.gpu.vram_gb:.1f} GB)")
    else:
        lines.append("Hardware: CPU-only mode")

    return "\n".join(lines)
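
# Example of the formatted recommendation (numbers and hardware names are illustrative only):
#
#     Model: Llama 3 8B (Q4_K_M, 32K ctx)
#     Context Window: 32K tokens
#     Instances: 3
#     GPU VRAM per instance: 6.2 GB
#     Total memory used: 18.6 GB
#     Available memory: 24.0 GB
#     Note: Limited by available memory
#     GPU: NVIDIA RTX 4090 (24.0 GB)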