fix: Remove slow HF API check from recommended config selection

- `select_optimal_model` was checking the HF API for available quantizations
- This caused the menu to hang or slow down when changing context
- Availability is now only checked when browsing models or selecting a custom config
- The recommended config uses the default quantizations (faster)
2026-02-23 23:54:57 +01:00
parent f2d0fddfa4
commit 2461f45ca8
2 changed files with 6 additions and 5 deletions
@@ -64,3 +64,5 @@
 {"t":"reindex","f":"src/interactive.py","s":0}
 {"t":"watch","files":33}
 {"t":"reindex","f":"src/models/selector.py","s":0}
+{"t":"watch","files":33}
+{"t":"reindex","f":"src/models/selector.py","s":0}
@@ -215,15 +215,14 @@ def select_optimal_model(
    available_vram, available_ram = get_available_memory_with_offload(hardware, offload_percent)
    
    # Get models to try (with appropriate quantizations)
-    # On Mac, check which quantizations are actually available
-    check_available = use_mlx
-    
+    # Note: Don't check available quantizations here (too slow for menu rendering)
+    # Only check when user is actually browsing or selecting custom config
    if preferred_model:
        from models.registry import get_model
-        preferred = get_model(preferred_model, use_mlx=use_mlx, check_available=check_available)
+        preferred = get_model(preferred_model, use_mlx=use_mlx, check_available=False)
        models = [preferred] if preferred else []
    else:
-        models = list_models(use_mlx=use_mlx, check_available=check_available)
+        models = list_models(use_mlx=use_mlx, check_available=False)
    
    # Note: On Apple Silicon with MLX, multiple instances work fine in sequential mode
    # The swarm manager will handle sequential execution to avoid GPU conflicts