server: introduce API for serving / loading / unloading multiple models (#17470)

* server: add model management and proxy

* fix compile error

* does this fix windows?

* fix windows build

* use subprocess.h, better logging

* add test

* fix windows

* feat: Model/Router server architecture WIP

* more stable

* fix unsafe pointer

* also allow terminate loading model

* add is_active()

* refactor: Architecture improvements

* tmp apply upstream fix

* address most problems

* address thread safety issue

* address review comment

* add docs (first version)

* address review comment

* feat: Improved UX for model information, modality interactions etc

* chore: update webui build output

* refactor: Use only the message data `model` property for displaying model used info

* chore: update webui build output

* add --models-dir param

* feat: New Model Selection UX WIP

* chore: update webui build output

* feat: Add auto-mic setting

* feat: Attachments UX improvements

* implement LRU

* remove default model path

* better --models-dir

* add env for args

* address review comments

* fix compile

* refactor: Chat Form Submit component

* add endpoint docs

* Merge remote-tracking branch 'webui/allozaur/server_model_management_v1_2' into xsn/server_model_maagement_v1_2

Co-authored-by: Aleksander <aleksander.grygier@gmail.com>

* feat: Add copy to clipboard to model name in model info dialog

* feat: Model unavailable UI state for model selector

* feat: Chat Form Actions UI logic improvements

* feat: Auto-select model from last assistant response

* chore: update webui build output

* expose args and exit_code in API

* add note

* support extra_args on loading model

* allow reusing args if auto_load

* typo docs

* oai-compat /models endpoint

* cleaner

* address review comments

* feat: Use `model` property for displaying the `repo/model-name` naming format

* refactor: Attachments data

* chore: update webui build output

* refactor: Enum imports

* feat: Improve Model Selector responsiveness

* chore: update webui build output

* refactor: Cleanup

* refactor: Cleanup

* refactor: Formatters

* chore: update webui build output

* refactor: Copy To Clipboard Icon component

* chore: update webui build output

* refactor: Cleanup

* chore: update webui build output

* refactor: UI badges

* chore: update webui build output

* refactor: Cleanup

* refactor: Cleanup

* chore: update webui build output

* add --models-allow-extra-args for security

* nits

* add stdin_file

* fix merge

* fix: Retrieve lost setting after resolving merge conflict

* refactor: DatabaseStore -> DatabaseService

* refactor: Database, Conversations & Chat services + stores architecture improvements (WIP)

* refactor: Remove redundant settings

* refactor: Multi-model business logic WIP

* chore: update webui build output

* feat: Switching models logic for ChatForm or when regenerating messages + modality detection logic

* chore: update webui build output

* fix: Add `untrack` inside chat processing info data logic to prevent infinite effect

* fix: Regenerate

* feat: Remove redundant settings + rearrange

* fix: Audio attachments

* refactor: Icons

* chore: update webui build output

* feat: Model management and selection features WIP

* chore: update webui build output

* refactor: Improve server properties management

* refactor: Icons

* chore: update webui build output

* feat: Improve model loading/unloading status updates

* chore: update webui build output

* refactor: Improve API header management via utility functions

* remove support for extra args

* set hf_repo/docker_repo as model alias when possible

* refactor: Remove ConversationsService

* refactor: Chat requests abort handling

* refactor: Server store

* tmp webui build

* refactor: Model modality handling

* chore: update webui build output

* refactor: Processing state reactivity

* fix: UI

* refactor: Services/Stores syntax + logic improvements

Refactors components to access stores directly instead of using exported getter functions.

This change centralizes store access and logic, simplifying component code and improving maintainability by reducing the number of exported functions and promoting direct store interaction.

Removes exported getter functions from `chat.svelte.ts`, `conversations.svelte.ts`, `models.svelte.ts` and `settings.svelte.ts`.

* refactor: Architecture cleanup

* feat: Improve statistic badges

* feat: Condition available models based on modality + better model loading strategy & UX

* docs: Architecture documentation

* feat: Update logic for PDF as Image

* add TODO for http client

* refactor: Enhance model info and attachment handling

* chore: update webui build output

* refactor: Components naming

* chore: update webui build output

* refactor: Cleanup

* refactor: DRY `getAttachmentDisplayItems` function + fix UI

* chore: update webui build output

* fix: Modality detection improvement for text-based PDF attachments

* refactor: Cleanup

* docs: Add info comment

* refactor: Cleanup

* re

* refactor: Cleanup

* refactor: Cleanup

* feat: Attachment logic & UI improvements

* refactor: Constants

* feat: Improve UI sidebar background color

* chore: update webui build output

* refactor: Utils imports + move types to `app.d.ts`

* test: Fix Storybook mocks

* chore: update webui build output

* test: Update Chat Form UI tests

* refactor: Tooltip Provider from core layout

* refactor: Tests to separate location

* decouple server_models from server_routes

* test: Move demo test  to tests/server

* refactor: Remove redundant method

* chore: update webui build output

* also route anthropic endpoints

* fix duplicated arg

* fix invalid ptr to shutdown_handler

* server : minor

* rm unused fn

* add ?autoload=true|false query param

* refactor: Remove redundant code

* docs: Update README documentations + architecture & data flow diagrams

* fix: Disable autoload on calling server props for the model

* chore: update webui build output

* fix ubuntu build

* fix: Model status reactivity

* fix: Modality detection for MODEL mode

* chore: update webui build output

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Xuan-Son Nguyen
2025-12-01 19:41:04 +01:00
committed by GitHub
parent 7733409734
commit ec18edfcba
178 changed files with 11643 additions and 4356 deletions
@@ -1,331 +1,136 @@
import { browser } from '$app/environment';
import { SERVER_PROPS_LOCALSTORAGE_KEY } from '$lib/constants/localstorage-keys';
import { ChatService } from '$lib/services/chat';
import { config } from '$lib/stores/settings.svelte';
import { PropsService } from '$lib/services/props';
import { ServerRole } from '$lib/enums';
/**
* ServerStore - Server state management and capability detection
* serverStore - Server connection state, configuration, and role detection
*
* This store manages communication with the llama.cpp server to retrieve and maintain
* server properties, model information, and capability detection. It provides reactive
* state for server connectivity, model capabilities, and endpoint availability.
* This store manages the server connection state and properties fetched from `/props`.
* It provides reactive state for server configuration and role detection.
*
* **Architecture & Relationships:**
* - **ServerStore** (this class): Server state and capability management
* - Fetches and caches server properties from `/props` endpoint
* - Detects model capabilities (vision, audio support)
* - Tests endpoint availability (slots endpoint)
* - Provides reactive server state for UI components
*
* - **ChatService**: Uses server properties for request validation
* - **SlotsService**: Depends on slots endpoint availability detection
* - **UI Components**: Subscribe to server state for capability-based rendering
* - **PropsService**: Stateless service for fetching `/props` data
* - **serverStore** (this class): Reactive store for server state
* - **modelsStore**: Independent store for model management (uses PropsService directly)
*
* **Key Features:**
* - **Server Properties**: Model path, context size, build information
* - **Capability Detection**: Vision and audio modality support
* - **Endpoint Testing**: Slots endpoint availability checking
* - **Error Handling**: User-friendly error messages for connection issues
* - **Reactive State**: Svelte 5 runes for automatic UI updates
* - **State Management**: Loading states and error recovery
*
* **Server Capabilities Detected:**
* - Model name extraction from file path
* - Vision support (multimodal image processing)
* - Audio support (speech processing)
* - Slots endpoint availability (for processing state monitoring)
* - Context window size and token limits
* - **Server State**: Connection status, loading, error handling
* - **Role Detection**: MODEL (single model) vs ROUTER (multi-model)
* - **Default Params**: Server-wide generation defaults
*/
class ServerStore {
constructor() {
if (!browser) return;
// ─────────────────────────────────────────────────────────────────────────────
// State
// ─────────────────────────────────────────────────────────────────────────────
const cachedProps = this.readCachedServerProps();
if (cachedProps) {
this._serverProps = cachedProps;
}
props = $state<ApiLlamaCppServerProps | null>(null);
loading = $state(false);
error = $state<string | null>(null);
role = $state<ServerRole | null>(null);
private fetchPromise: Promise<void> | null = null;
// ─────────────────────────────────────────────────────────────────────────────
// Getters
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Default generation parameters taken from the currently stored server props.
 *
 * @returns `props.default_generation_settings.params`, or `null` when props
 *          (or any intermediate field) are missing or the value is falsy.
 */
get defaultParams(): ApiLlamaCppServerProps['default_generation_settings']['params'] | null {
	const generationSettings = this.props?.default_generation_settings;
	const params = generationSettings?.params;
	return params || null;
}
private _serverProps = $state<ApiLlamaCppServerProps | null>(null);
private _loading = $state(false);
private _error = $state<string | null>(null);
private _serverWarning = $state<string | null>(null);
private _slotsEndpointAvailable = $state<boolean | null>(null);
private fetchServerPropsPromise: Promise<void> | null = null;
private readCachedServerProps(): ApiLlamaCppServerProps | null {
if (!browser) return null;
try {
const raw = localStorage.getItem(SERVER_PROPS_LOCALSTORAGE_KEY);
if (!raw) return null;
return JSON.parse(raw) as ApiLlamaCppServerProps;
} catch (error) {
console.warn('Failed to read cached server props from localStorage:', error);
return null;
}
/**
 * Context size (`n_ctx`) read from the stored server props.
 *
 * @returns `props.default_generation_settings.n_ctx`, or `null` when it is
 *          `null`/`undefined` or props have not been set.
 */
get contextSize(): number | null {
	const nCtx = this.props?.default_generation_settings?.n_ctx;
	return nCtx ?? null;
}
private persistServerProps(props: ApiLlamaCppServerProps | null): void {
if (!browser) return;
try {
if (props) {
localStorage.setItem(SERVER_PROPS_LOCALSTORAGE_KEY, JSON.stringify(props));
} else {
localStorage.removeItem(SERVER_PROPS_LOCALSTORAGE_KEY);
}
} catch (error) {
console.warn('Failed to persist server props to localStorage:', error);
}
/**
 * Whether the detected server role is `ServerRole.ROUTER`.
 * Evaluates to `false` while `role` is still `null`.
 */
get isRouterMode(): boolean {
	const currentRole = this.role;
	return currentRole === ServerRole.ROUTER;
}
get serverProps(): ApiLlamaCppServerProps | null {
return this._serverProps;
/**
 * Whether the detected server role is `ServerRole.MODEL`.
 * Evaluates to `false` while `role` is still `null`.
 */
get isModelMode(): boolean {
	const currentRole = this.role;
	return currentRole === ServerRole.MODEL;
}
get loading(): boolean {
return this._loading;
}
// ─────────────────────────────────────────────────────────────────────────────
// Data Handling
// ─────────────────────────────────────────────────────────────────────────────
get error(): string | null {
return this._error;
}
async fetch(): Promise<void> {
if (this.fetchPromise) return this.fetchPromise;
get serverWarning(): string | null {
return this._serverWarning;
}
get modelName(): string | null {
if (this._serverProps?.model_alias) {
return this._serverProps.model_alias;
}
if (!this._serverProps?.model_path) return null;
return this._serverProps.model_path.split(/(\\|\/)/).pop() || null;
}
get supportedModalities(): string[] {
const modalities: string[] = [];
if (this._serverProps?.modalities?.audio) {
modalities.push('audio');
}
if (this._serverProps?.modalities?.vision) {
modalities.push('vision');
}
return modalities;
}
get supportsVision(): boolean {
return this._serverProps?.modalities?.vision ?? false;
}
get supportsAudio(): boolean {
return this._serverProps?.modalities?.audio ?? false;
}
get slotsEndpointAvailable(): boolean | null {
return this._slotsEndpointAvailable;
}
get serverDefaultParams():
| ApiLlamaCppServerProps['default_generation_settings']['params']
| null {
return this._serverProps?.default_generation_settings?.params || null;
}
/**
* Check if slots endpoint is available based on server properties and endpoint support
*/
private async checkSlotsEndpointAvailability(): Promise<void> {
if (!this._serverProps) {
this._slotsEndpointAvailable = false;
return;
}
if (this._serverProps.total_slots <= 0) {
this._slotsEndpointAvailable = false;
return;
}
try {
const currentConfig = config();
const apiKey = currentConfig.apiKey?.toString().trim();
const response = await fetch(`./slots`, {
headers: {
...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
}
});
if (response.status === 501) {
console.info('Slots endpoint not implemented - server started without --slots flag');
this._slotsEndpointAvailable = false;
return;
}
this._slotsEndpointAvailable = true;
} catch (error) {
console.warn('Unable to test slots endpoint availability:', error);
this._slotsEndpointAvailable = false;
}
}
/**
* Fetches server properties from the server
*/
async fetchServerProps(options: { silent?: boolean } = {}): Promise<void> {
const { silent = false } = options;
const isSilent = silent && this._serverProps !== null;
if (this.fetchServerPropsPromise) {
return this.fetchServerPropsPromise;
}
if (!isSilent) {
this._loading = true;
this._error = null;
this._serverWarning = null;
}
const hadProps = this._serverProps !== null;
this.loading = true;
this.error = null;
const fetchPromise = (async () => {
try {
const props = await ChatService.getServerProps();
this._serverProps = props;
this.persistServerProps(props);
this._error = null;
this._serverWarning = null;
await this.checkSlotsEndpointAvailability();
const props = await PropsService.fetch();
this.props = props;
this.error = null;
this.detectRole(props);
} catch (error) {
if (isSilent && hadProps) {
console.warn('Silent server props refresh failed, keeping cached data:', error);
return;
}
this.handleFetchServerPropsError(error, hadProps);
this.error = this.getErrorMessage(error);
console.error('Error fetching server properties:', error);
} finally {
if (!isSilent) {
this._loading = false;
}
this.fetchServerPropsPromise = null;
this.loading = false;
this.fetchPromise = null;
}
})();
this.fetchServerPropsPromise = fetchPromise;
this.fetchPromise = fetchPromise;
await fetchPromise;
}
/**
* Handles fetch failures by attempting to recover cached server props and
* updating the user-facing error or warning state appropriately.
*/
private handleFetchServerPropsError(error: unknown, hadProps: boolean): void {
const { errorMessage, isOfflineLikeError, isServerSideError } = this.normalizeFetchError(error);
let cachedProps: ApiLlamaCppServerProps | null = null;
if (!hadProps) {
cachedProps = this.readCachedServerProps();
if (cachedProps) {
this._serverProps = cachedProps;
this._error = null;
if (isOfflineLikeError || isServerSideError) {
this._serverWarning = errorMessage;
}
console.warn(
'Failed to refresh server properties, using cached values from localStorage:',
errorMessage
);
} else {
this._error = errorMessage;
}
} else {
this._error = null;
if (isOfflineLikeError || isServerSideError) {
this._serverWarning = errorMessage;
}
console.warn(
'Failed to refresh server properties, continuing with cached values:',
errorMessage
);
}
console.error('Error fetching server properties:', error);
}
private normalizeFetchError(error: unknown): {
errorMessage: string;
isOfflineLikeError: boolean;
isServerSideError: boolean;
} {
let errorMessage = 'Failed to connect to server';
let isOfflineLikeError = false;
let isServerSideError = false;
private getErrorMessage(error: unknown): string {
if (error instanceof Error) {
const message = error.message || '';
if (error.name === 'TypeError' && message.includes('fetch')) {
errorMessage = 'Server is not running or unreachable';
isOfflineLikeError = true;
return 'Server is not running or unreachable';
} else if (message.includes('ECONNREFUSED')) {
errorMessage = 'Connection refused - server may be offline';
isOfflineLikeError = true;
return 'Connection refused - server may be offline';
} else if (message.includes('ENOTFOUND')) {
errorMessage = 'Server not found - check server address';
isOfflineLikeError = true;
return 'Server not found - check server address';
} else if (message.includes('ETIMEDOUT')) {
errorMessage = 'Request timed out - the server took too long to respond';
isOfflineLikeError = true;
return 'Request timed out';
} else if (message.includes('503')) {
errorMessage = 'Server temporarily unavailable - try again shortly';
isServerSideError = true;
return 'Server temporarily unavailable';
} else if (message.includes('500')) {
errorMessage = 'Server error - check server logs';
isServerSideError = true;
return 'Server error - check server logs';
} else if (message.includes('404')) {
errorMessage = 'Server endpoint not found';
return 'Server endpoint not found';
} else if (message.includes('403') || message.includes('401')) {
errorMessage = 'Access denied';
return 'Access denied';
}
}
return { errorMessage, isOfflineLikeError, isServerSideError };
return 'Failed to connect to server';
}
/**
* Clears the server state
*/
clear(): void {
this._serverProps = null;
this._error = null;
this._serverWarning = null;
this._loading = false;
this._slotsEndpointAvailable = null;
this.fetchServerPropsPromise = null;
this.persistServerProps(null);
this.props = null;
this.error = null;
this.loading = false;
this.role = null;
this.fetchPromise = null;
}
// ─────────────────────────────────────────────────────────────────────────────
// Utilities
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Derives the server role from freshly fetched props and stores it on `role`.
 *
 * Anything other than an explicit `ServerRole.ROUTER` value in `props.role`
 * is treated as `ServerRole.MODEL`. The role is only reassigned (and the mode
 * logged) when it differs from the currently stored value.
 *
 * @param props - Server properties whose `role` field is inspected.
 */
private detectRole(props: ApiLlamaCppServerProps): void {
	const isRouter = props?.role === ServerRole.ROUTER;
	const detectedRole = isRouter ? ServerRole.ROUTER : ServerRole.MODEL;

	// Nothing to do when the role is unchanged; avoids a redundant state write and log line.
	if (this.role === detectedRole) {
		return;
	}

	this.role = detectedRole;
	const modeLabel = isRouter ? 'ROUTER' : 'MODEL';
	console.info(`Server running in ${modeLabel} mode`);
}
}
export const serverStore = new ServerStore();
export const serverProps = () => serverStore.serverProps;
export const serverProps = () => serverStore.props;
export const serverLoading = () => serverStore.loading;
export const serverError = () => serverStore.error;
export const serverWarning = () => serverStore.serverWarning;
export const modelName = () => serverStore.modelName;
export const supportedModalities = () => serverStore.supportedModalities;
export const supportsVision = () => serverStore.supportsVision;
export const supportsAudio = () => serverStore.supportsAudio;
export const slotsEndpointAvailable = () => serverStore.slotsEndpointAvailable;
export const serverDefaultParams = () => serverStore.serverDefaultParams;
export const serverRole = () => serverStore.role;
export const defaultParams = () => serverStore.defaultParams;
export const contextSize = () => serverStore.contextSize;
export const isRouterMode = () => serverStore.isRouterMode;
export const isModelMode = () => serverStore.isModelMode;