webui: Add option to pre-encode conversation for faster next turns (#21034)
This commit is contained in:
committed by
GitHub
parent
b54cb2e3d0
commit
75511a8d7e
@@ -4,7 +4,8 @@ import { isAbortError } from '$lib/utils/abort';
|
||||
import {
|
||||
ATTACHMENT_LABEL_PDF_FILE,
|
||||
ATTACHMENT_LABEL_MCP_PROMPT,
|
||||
ATTACHMENT_LABEL_MCP_RESOURCE
|
||||
ATTACHMENT_LABEL_MCP_RESOURCE,
|
||||
LEGACY_AGENTIC_REGEX
|
||||
} from '$lib/constants';
|
||||
import {
|
||||
AttachmentType,
|
||||
@@ -279,6 +280,107 @@ export class ChatService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether all server slots are currently idle (not processing any requests).
|
||||
* Queries the /slots endpoint (requires --slots flag on the server).
|
||||
* Returns true if all slots are idle, false if any is processing.
|
||||
* If the endpoint is unavailable or errors out, returns true (best-effort fallback).
|
||||
*
|
||||
* @param signal - Optional AbortSignal to cancel the request if needed
|
||||
* @param model - Optional model name to check slots for (required in ROUTER mode)
|
||||
* @returns {Promise<boolean>} Promise that resolves to true if all slots are idle, false if any is processing
|
||||
*/
|
||||
static async areAllSlotsIdle(model?: string | null, signal?: AbortSignal): Promise<boolean> {
|
||||
try {
|
||||
const url = model ? `./slots?model=${encodeURIComponent(model)}` : './slots';
|
||||
const res = await fetch(url, { signal });
|
||||
if (!res.ok) return true;
|
||||
|
||||
const slots: { is_processing: boolean }[] = await res.json();
|
||||
return slots.every((s) => !s.is_processing);
|
||||
} catch {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a fire-and-forget request to pre-encode the conversation in the server's KV cache.
|
||||
* After a response completes, this re-submits the full conversation
|
||||
* using n_predict=0 and stream=false so the server processes the prompt without generating tokens.
|
||||
* This warms the cache for the next turn, making it faster.
|
||||
*
|
||||
* When excludeReasoningFromContext is true, reasoning content is stripped from the messages
|
||||
* to match what sendMessage would send on the next turn (avoiding cache misses).
|
||||
* When false, reasoning_content is preserved so the cached prompt matches the next request.
|
||||
*
|
||||
* @param messages - The full conversation including the latest assistant response
|
||||
* @param model - Optional model name (required in ROUTER mode)
|
||||
* @param excludeReasoning - Whether to strip reasoning content (should match excludeReasoningFromContext setting)
|
||||
* @param signal - Optional AbortSignal to cancel the pre-encode request
|
||||
*/
|
||||
static async preEncode(
|
||||
messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
|
||||
model?: string | null,
|
||||
excludeReasoning?: boolean,
|
||||
signal?: AbortSignal
|
||||
): Promise<void> {
|
||||
const normalizedMessages: ApiChatMessageData[] = messages
|
||||
.map((msg) => {
|
||||
if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
|
||||
return ChatService.convertDbMessageToApiChatMessageData(
|
||||
msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] }
|
||||
);
|
||||
}
|
||||
|
||||
return msg as ApiChatMessageData;
|
||||
})
|
||||
.filter((msg) => {
|
||||
if (msg.role === MessageRole.SYSTEM) {
|
||||
const content = typeof msg.content === 'string' ? msg.content : '';
|
||||
|
||||
return content.trim().length > 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
|
||||
const requestBody: Record<string, unknown> = {
|
||||
messages: normalizedMessages.map((msg: ApiChatMessageData) => {
|
||||
const mapped: Record<string, unknown> = {
|
||||
role: msg.role,
|
||||
content: excludeReasoning ? ChatService.stripReasoningContent(msg.content) : msg.content,
|
||||
tool_calls: msg.tool_calls,
|
||||
tool_call_id: msg.tool_call_id
|
||||
};
|
||||
|
||||
if (!excludeReasoning && msg.reasoning_content) {
|
||||
mapped.reasoning_content = msg.reasoning_content;
|
||||
}
|
||||
|
||||
return mapped;
|
||||
}),
|
||||
stream: false,
|
||||
n_predict: 0
|
||||
};
|
||||
|
||||
if (model) {
|
||||
requestBody.model = model;
|
||||
}
|
||||
|
||||
try {
|
||||
await fetch(`./v1/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: getJsonHeaders(),
|
||||
body: JSON.stringify(requestBody),
|
||||
signal
|
||||
});
|
||||
} catch (error) {
|
||||
if (!isAbortError(error)) {
|
||||
console.warn('[ChatService] Pre-encode request failed:', error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
@@ -799,6 +901,28 @@ export class ChatService {
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* Strips legacy inline reasoning content tags from message content.
|
||||
* Handles both plain string content and multipart content arrays.
|
||||
*/
|
||||
private static stripReasoningContent(
|
||||
content: string | ApiChatMessageContentPart[]
|
||||
): string | ApiChatMessageContentPart[] {
|
||||
const stripFromString = (text: string): string =>
|
||||
text.replace(LEGACY_AGENTIC_REGEX.REASONING_BLOCK, '').trim();
|
||||
|
||||
if (typeof content === 'string') {
|
||||
return stripFromString(content);
|
||||
}
|
||||
|
||||
return content.map((part) => {
|
||||
if (part.type === ContentPartType.TEXT && part.text) {
|
||||
return { ...part, text: stripFromString(part.text) };
|
||||
}
|
||||
return part;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses error response and creates appropriate error with context information
|
||||
* @param response - HTTP response object
|
||||
|
||||
Reference in New Issue
Block a user