Files
sleepy_agent_ios/SleepyAgent/Inference/LlmEngine.swift
sleepy 45d43f2645 Add Objective-C++ bridge for LiteRT-LM integration
- LlmEngineBridge.h/.mm: Objective-C++ wrapper around LiteRT-LM C++ API
- SleepyAgent-Bridging-Header.h: Swift bridging header
- Updated LlmEngine.swift to use the bridge
- Added LITERT_INTEGRATION.md with detailed research findings

Based on analysis of Google's litert-samples repository:
- Google uses a C++ bridge pattern for iOS (confirmed in the image_segmentation example)
- MediaPipe has a working Swift API, but it is deprecated
- LiteRT-LM Swift APIs are 'coming soon'

The bridge pattern matches how the Google AI Edge Gallery iOS app is likely implemented
2026-04-06 14:54:06 +02:00
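
For orientation, the sketch below reconstructs the bridge surface as it would appear from Swift, using only the call sites in LlmEngine.swift; the real LlmEngineBridge.h may differ, and both the Accelerator enum and the stream type's name (LlmResponseStreaming) are assumptions:

    import Foundation

    // Reconstructed interface sketch; names marked "assumed" are not confirmed.
    enum Accelerator { case cpu, metal }  // assumed: only these two cases appear in the file

    protocol LlmResponseStreaming: AnyObject {  // assumed name for the bridge's stream object
        var hasMore: Bool { get }
        func nextChunk() -> String?
        func close()
    }

    protocol LlmEngineBridging: AnyObject {  // mirrors the imported LlmEngineBridge class
        init?(modelPath: String, accelerator: Accelerator, error: NSErrorPointer)
        func clearHistory()
        func add(toHistory text: String, role: String)
        func generateResponse(_ prompt: String, error: NSErrorPointer) -> String?
        func generateResponseStream(_ prompt: String, error: NSErrorPointer) -> LlmResponseStreaming?
        func close()
        func estimateTokens(_ text: String) -> Int
    }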

284 lines
9.2 KiB
Swift

import Foundation
import UIKit

// MARK: - Errors

public enum LlmEngineError: LocalizedError {
    case modelNotFound(path: String)
    case modelNotLoaded
    case conversationClosed
    case generationFailed(underlying: Error)
    case invalidMultimodalInput
    case engineInitializationFailed(underlying: Error)
    case notImplemented

    public var errorDescription: String? {
        switch self {
        case .modelNotFound(let path):
            return "Model file not found at: \(path)"
        case .modelNotLoaded:
            return "No model is currently loaded"
        case .conversationClosed:
            return "Conversation has been closed"
        case .generationFailed(let error):
            return "Generation failed: \(error.localizedDescription)"
        case .invalidMultimodalInput:
            return "Invalid multimodal input provided"
        case .engineInitializationFailed(let error):
            return "Failed to initialize engine: \(error.localizedDescription)"
        case .notImplemented:
            return "This feature requires LiteRT-LM C++ integration"
        }
    }
}

// MARK: - Conversation

/// A handle for one chat session. `@unchecked Sendable` relies on the
/// engine actor being the only place this state is mutated.
public final class Conversation: @unchecked Sendable {
    public var isAlive: Bool = true
    internal var messageHistory: [(role: String, content: String)] = []

    internal init() {}

    /// Marks the conversation closed; later generation calls will throw.
    public func close() {
        isAlive = false
    }

    deinit {
        close()
    }
}

// MARK: - LlmEngine Protocol

public protocol LlmEngine: Actor {
    var isLoaded: Bool { get }

    func loadModel(path: String) async throws
    func createConversation(systemPrompt: String) throws -> Conversation
    func generate(
        conversation: Conversation,
        prompt: String,
        audioData: Data?,
        images: [UIImage]?
    ) async throws -> String
    func generateStream(
        conversation: Conversation,
        prompt: String,
        audioData: Data?,
        images: [UIImage]?
    ) -> AsyncThrowingStream<String, Error>
    func unload()
}
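
// MARK: - Usage (illustrative)

/// A minimal sketch of driving an `LlmEngine` from the caller's side.
/// The prompts and model path are placeholders, not values used by the app.
func exampleLlmEngineUsage(engine: some LlmEngine, modelPath: String) async throws {
    // Load the model from disk (throws `.modelNotFound` for a bad path).
    try await engine.loadModel(path: modelPath)

    // Every conversation starts from a system prompt.
    let conversation = try await engine.createConversation(systemPrompt: "You are a helpful assistant.")

    // One-shot generation: the full response is returned at once.
    let reply = try await engine.generate(
        conversation: conversation, prompt: "Hello!", audioData: nil, images: nil)
    print(reply)

    // Streaming generation: chunks are yielded as they are produced.
    let stream = await engine.generateStream(
        conversation: conversation, prompt: "Tell me more.", audioData: nil, images: nil)
    for try await chunk in stream {
        print(chunk, terminator: "")
    }
}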

// MARK: - LiteRT-LM Engine Implementation

/// LiteRT-LM based LLM engine using an Objective-C++ bridge.
///
/// Architecture:
/// - Swift `LlmEngine` (this file) -> Obj-C++ `LlmEngineBridge` -> C++ LiteRT-LM
///
/// This approach is necessary because:
/// 1. LiteRT-LM Swift APIs are "coming soon" (as of 2025)
/// 2. Google's own apps use the C++ bridge pattern (verified in litert-samples)
public actor LiteRtLlmEngine: LlmEngine {
    public static let shared = LiteRtLlmEngine()

    public private(set) var isLoaded: Bool = false
    private var currentModelPath: String?
    private var currentConversation: Conversation?
    private var systemPrompt: String = ""

    // Objective-C++ bridge instance
    private var bridge: LlmEngineBridge?

    private let maxTokens = 16384 // not yet used

    private init() {}

    // MARK: - Model Loading

    public func loadModel(path: String) async throws {
        unload()

        guard FileManager.default.fileExists(atPath: path) else {
            throw LlmEngineError.modelNotFound(path: path)
        }

        // Initialize the Objective-C++ bridge
        var error: NSError?
        let newBridge = LlmEngineBridge(
            modelPath: path,
            accelerator: .cpu, // Can use .metal for GPU acceleration
            error: &error
        )
        if let error = error {
            throw LlmEngineError.engineInitializationFailed(underlying: error)
        }
        guard let bridge = newBridge else {
            throw LlmEngineError.engineInitializationFailed(
                underlying: NSError(domain: "LlmEngine", code: -1, userInfo: [NSLocalizedDescriptionKey: "Failed to create bridge"])
            )
        }

        self.bridge = bridge
        self.isLoaded = true
        self.currentModelPath = path
        print("[LiteRtLlmEngine] Model loaded: \(path)")
    }

    // MARK: - Conversation Management

    public func createConversation(systemPrompt: String) throws -> Conversation {
        guard isLoaded else {
            throw LlmEngineError.modelNotLoaded
        }

        self.systemPrompt = systemPrompt
        let conversation = Conversation()
        conversation.messageHistory.append(("system", systemPrompt))

        // Clear any existing history in the bridge
        bridge?.clearHistory()
        // Add system prompt to bridge
        bridge?.add(toHistory: systemPrompt, role: "system")

        self.currentConversation = conversation
        return conversation
    }

    // MARK: - Generation

    public func generate(
        conversation: Conversation,
        prompt: String,
        audioData: Data? = nil,
        images: [UIImage]? = nil
    ) async throws -> String {
        guard conversation.isAlive else {
            throw LlmEngineError.conversationClosed
        }
        guard isLoaded, let bridge = bridge else {
            throw LlmEngineError.modelNotLoaded
        }

        // TODO: Handle multimodal inputs (images, audio)
        // For now, focus on text-only generation
        if audioData != nil || !(images?.isEmpty ?? true) {
            // Multimodal not yet implemented in bridge
            throw LlmEngineError.notImplemented
        }

        // Add user message to history
        conversation.messageHistory.append(("user", prompt))
        bridge.add(toHistory: prompt, role: "user")

        // Generate response
        var error: NSError?
        let response = bridge.generateResponse(prompt, error: &error)
        if let error = error {
            throw LlmEngineError.generationFailed(underlying: error)
        }
        guard let text = response else {
            throw LlmEngineError.generationFailed(
                underlying: NSError(domain: "LlmEngine", code: -1, userInfo: [NSLocalizedDescriptionKey: "Empty response"])
            )
        }

        // Add assistant response to history
        conversation.messageHistory.append(("assistant", text))
        bridge.add(toHistory: text, role: "assistant")

        return text
    }
    public func generateStream(
        conversation: Conversation,
        prompt: String,
        audioData: Data? = nil,
        images: [UIImage]? = nil
    ) -> AsyncThrowingStream<String, Error> {
        AsyncThrowingStream { continuation in
            Task {
                do {
                    guard conversation.isAlive else {
                        throw LlmEngineError.conversationClosed
                    }
                    guard self.isLoaded, let bridge = self.bridge else {
                        throw LlmEngineError.modelNotLoaded
                    }

                    // Handle multimodal (not implemented)
                    if audioData != nil || !(images?.isEmpty ?? true) {
                        throw LlmEngineError.notImplemented
                    }

                    // Add user message to history
                    conversation.messageHistory.append(("user", prompt))
                    bridge.add(toHistory: prompt, role: "user")

                    // Get streaming response from bridge
                    var error: NSError?
                    guard let stream = bridge.generateResponseStream(prompt, error: &error) else {
                        if let error = error {
                            throw LlmEngineError.generationFailed(underlying: error)
                        } else {
                            throw LlmEngineError.generationFailed(
                                underlying: NSError(domain: "LlmEngine", code: -1, userInfo: [NSLocalizedDescriptionKey: "Failed to create stream"])
                            )
                        }
                    }

                    // Read chunks from stream
                    var fullResponse = ""
                    while stream.hasMore {
                        if let chunk = stream.nextChunk() {
                            continuation.yield(chunk)
                            fullResponse.append(chunk)
                        }
                        // Brief sleep so this polling loop yields instead of
                        // spinning on the actor.
                        try await Task.sleep(nanoseconds: 1_000_000) // 1ms
                    }

                    // Close stream
                    stream.close()

                    // Add complete response to history
                    conversation.messageHistory.append(("assistant", fullResponse))
                    bridge.add(toHistory: fullResponse, role: "assistant")

                    continuation.finish()
                } catch {
                    continuation.finish(throwing: error)
                }
            }
        }
    }

    // MARK: - Utility

    public func unload() {
        bridge?.close()
        bridge = nil
        isLoaded = false
        currentModelPath = nil
        currentConversation = nil
        print("[LiteRtLlmEngine] Unloaded")
    }

    public func estimateTokens(text: String) -> Int {
        // Falls back to a rough 4-characters-per-token heuristic when no bridge is loaded.
        return bridge?.estimateTokens(text) ?? (text.count / 4)
    }
}
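
// MARK: - Note on the NSError bridging pattern

/// The bridge reports failures through `NSError` out-parameters rather than
/// Swift `throws`. A hypothetical helper like this one (not used above) shows
/// how that pattern could be folded into idiomatic Swift error handling:
func bridgeCall<T>(_ body: (NSErrorPointer) -> T?) throws -> T {
    var error: NSError?
    let result = body(&error)
    // An NSError written by the bridge wins over a nil result.
    if let error {
        throw error
    }
    guard let result else {
        throw LlmEngineError.generationFailed(
            underlying: NSError(domain: "LlmEngine", code: -1)
        )
    }
    return result
}
// Usage sketch: let text = try bridgeCall { bridge.generateResponse(prompt, error: $0) }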