webui: Add option to pre-encode conversation for faster next turns (#21034)

This commit is contained in:
Aleksander Grygier
2026-04-09 09:10:18 +02:00
committed by GitHub
parent b54cb2e3d0
commit 75511a8d7e
7 changed files with 267 additions and 81 deletions
@@ -4,7 +4,8 @@ import { isAbortError } from '$lib/utils/abort';
import {
ATTACHMENT_LABEL_PDF_FILE,
ATTACHMENT_LABEL_MCP_PROMPT,
ATTACHMENT_LABEL_MCP_RESOURCE
ATTACHMENT_LABEL_MCP_RESOURCE,
LEGACY_AGENTIC_REGEX
} from '$lib/constants';
import {
AttachmentType,
@@ -279,6 +280,107 @@ export class ChatService {
}
}
/**
* Checks whether all server slots are currently idle (not processing any requests).
* Queries the /slots endpoint (requires --slots flag on the server).
* Returns true if all slots are idle, false if any is processing.
* If the endpoint is unavailable or errors out, returns true (best-effort fallback).
*
* @param signal - Optional AbortSignal to cancel the request if needed
* @param model - Optional model name to check slots for (required in ROUTER mode)
* @returns {Promise<boolean>} Promise that resolves to true if all slots are idle, false if any is processing
*/
static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
try {
const url = model ? `./slots?model=${encodeURIComponent(model)}` : './slots';
const res = await fetch(url, { signal });
if (!res.ok) return true;
const slots: { is_processing: boolean }[] = await res.json();
return slots.every((s) => !s.is_processing);
} catch {
return true;
}
}
/**
 * Sends a fire-and-forget request that pre-encodes the conversation into the
 * server's KV cache. After a response completes, the full conversation is
 * re-submitted with n_predict=0 and stream=false, so the server processes the
 * prompt without generating any tokens — warming the cache and speeding up the
 * next turn.
 *
 * When excludeReasoning is true, reasoning content is stripped from the
 * messages so the cached prompt matches what sendMessage will send on the next
 * turn (avoiding cache misses). When false, reasoning_content is kept for the
 * same reason.
 *
 * @param messages - The full conversation including the latest assistant response
 * @param model - Optional model name (required in ROUTER mode)
 * @param excludeReasoning - Whether to strip reasoning content (should match excludeReasoningFromContext setting)
 * @param signal - Optional AbortSignal to cancel the pre-encode request
 */
static async preEncode(
	messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
	model?: string | null,
	excludeReasoning?: boolean,
	signal?: AbortSignal
): Promise<void> {
	// Convert database-backed messages to the wire format; API-shaped
	// messages pass through unchanged.
	const toApiMessage = (
		message: ApiChatMessageData | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })
	): ApiChatMessageData =>
		'id' in message && 'convId' in message && 'timestamp' in message
			? ChatService.convertDbMessageToApiChatMessageData(
					message as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
				)
			: (message as ApiChatMessageData);

	// Drop system messages whose string content is blank; non-system messages
	// always pass. NOTE(review): a system message with multipart (array)
	// content is also dropped here — presumably system prompts are always
	// plain strings; confirm against callers.
	const keepMessage = (message: ApiChatMessageData): boolean => {
		if (message.role !== MessageRole.SYSTEM) return true;

		return typeof message.content === 'string' && message.content.trim().length > 0;
	};

	// Build the request payload; key order mirrors what sendMessage emits so
	// the cached prompt matches the next real request.
	const payloadMessages = messages
		.map(toApiMessage)
		.filter(keepMessage)
		.map((message: ApiChatMessageData) => {
			const entry: Record<string, unknown> = {
				role: message.role,
				content: excludeReasoning
					? ChatService.stripReasoningContent(message.content)
					: message.content,
				tool_calls: message.tool_calls,
				tool_call_id: message.tool_call_id
			};

			if (!excludeReasoning && message.reasoning_content) {
				entry.reasoning_content = message.reasoning_content;
			}

			return entry;
		});

	const requestBody: Record<string, unknown> = {
		messages: payloadMessages,
		stream: false,
		n_predict: 0
	};

	if (model) {
		requestBody.model = model;
	}

	// Fire-and-forget: the response is intentionally ignored; only log
	// genuine failures (aborts are expected when the user moves on).
	try {
		await fetch(`./v1/chat/completions`, {
			method: 'POST',
			headers: getJsonHeaders(),
			body: JSON.stringify(requestBody),
			signal
		});
	} catch (error) {
		if (!isAbortError(error)) {
			console.warn('[ChatService] Pre-encode request failed:', error);
		}
	}
}
/**
*
*
@@ -799,6 +901,28 @@ export class ChatService {
*
*/
/**
 * Strips legacy inline reasoning content tags
 * (LEGACY_AGENTIC_REGEX.REASONING_BLOCK matches) from message content.
 * Handles both plain string content and multipart content arrays; non-text
 * parts are passed through untouched.
 */
private static stripReasoningContent(
	content: string | ApiChatMessageContentPart[]
): string | ApiChatMessageContentPart[] {
	const clean = (text: string): string =>
		text.replace(LEGACY_AGENTIC_REGEX.REASONING_BLOCK, '').trim();

	if (Array.isArray(content)) {
		const parts: ApiChatMessageContentPart[] = [];

		for (const part of content) {
			if (part.type === ContentPartType.TEXT && part.text) {
				parts.push({ ...part, text: clean(part.text) });
			} else {
				parts.push(part);
			}
		}

		return parts;
	}

	return clean(content);
}
/**
* Parses error response and creates appropriate error with context information
* @param response - HTTP response object