Files
sleepy_agent_ios/SleepyAgent/Inference/LlmEngine.swift
sleepy 45d43f2645 Add Objective-C++ bridge for LiteRT-LM integration
- LlmEngineBridge.h/.mm: Objective-C++ wrapper around LiteRT-LM C++ API
- SleepyAgent-Bridging-Header.h: Swift bridging header
- Updated LlmEngine.swift to use the bridge
- Added LITERT_INTEGRATION.md with detailed research findings

Based on analysis of Google's litert-samples repository:
- Google uses a C++ bridge pattern for iOS (confirmed in the image_segmentation example)
- MediaPipe has a working Swift API, but it is deprecated
- LiteRT-LM Swift APIs are 'coming soon'

The bridge pattern matches how the Google AI Edge Gallery iOS app is likely implemented
2026-04-06 14:54:06 +02:00
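
For orientation, the sketch below reconstructs the bridge surface as it would appear from Swift, using only the call sites in LlmEngine.swift; the real LlmEngineBridge.h may differ, and both the Accelerator enum and the stream type's name (LlmResponseStreaming) are assumptions:

    import Foundation

    // Reconstructed interface sketch; names marked "assumed" are not confirmed.
    enum Accelerator { case cpu, metal }  // assumed: only these two cases appear in the file

    protocol LlmResponseStreaming: AnyObject {  // assumed name for the bridge's stream object
        var hasMore: Bool { get }
        func nextChunk() -> String?
        func close()
    }

    protocol LlmEngineBridging: AnyObject {  // mirrors the imported LlmEngineBridge class
        init?(modelPath: String, accelerator: Accelerator, error: NSErrorPointer)
        func clearHistory()
        func add(toHistory text: String, role: String)
        func generateResponse(_ prompt: String, error: NSErrorPointer) -> String?
        func generateResponseStream(_ prompt: String, error: NSErrorPointer) -> LlmResponseStreaming?
        func close()
        func estimateTokens(_ text: String) -> Int
    }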

284 lines
9.2 KiB
Swift

import Foundation
import UIKit

// MARK: - Errors

public enum LlmEngineError: LocalizedError {
    case modelNotFound(path: String)
    case modelNotLoaded
    case conversationClosed
    case generationFailed(underlying: Error)
    case invalidMultimodalInput
    case engineInitializationFailed(underlying: Error)
    case notImplemented

    public var errorDescription: String? {
        switch self {
        case .modelNotFound(let path):
            return "Model file not found at: \(path)"
        case .modelNotLoaded:
            return "No model is currently loaded"
        case .conversationClosed:
            return "Conversation has been closed"
        case .generationFailed(let error):
            return "Generation failed: \(error.localizedDescription)"
        case .invalidMultimodalInput:
            return "Invalid multimodal input provided"
        case .engineInitializationFailed(let error):
            return "Failed to initialize engine: \(error.localizedDescription)"
        case .notImplemented:
            return "This feature requires LiteRT-LM C++ integration"
        }
    }
}

// MARK: - Conversation

/// A handle for one chat session. `@unchecked Sendable` relies on the
/// engine actor being the only place this state is mutated.
public final class Conversation: @unchecked Sendable {
    public var isAlive: Bool = true
    internal var messageHistory: [(role: String, content: String)] = []

    internal init() {}

    /// Marks the conversation closed; later generation calls will throw.
    public func close() {
        isAlive = false
    }

    deinit {
        close()
    }
}

// MARK: - LlmEngine Protocol

public protocol LlmEngine: Actor {
    var isLoaded: Bool { get }

    func loadModel(path: String) async throws
    func createConversation(systemPrompt: String) throws -> Conversation
    func generate(
        conversation: Conversation,
        prompt: String,
        audioData: Data?,
        images: [UIImage]?
    ) async throws -> String
    func generateStream(
        conversation: Conversation,
        prompt: String,
        audioData: Data?,
        images: [UIImage]?
    ) -> AsyncThrowingStream<String, Error>
    func unload()
}
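
// MARK: - Usage (illustrative)

/// A minimal sketch of driving an `LlmEngine` from the caller's side.
/// The prompts and model path are placeholders, not values used by the app.
func exampleLlmEngineUsage(engine: some LlmEngine, modelPath: String) async throws {
    // Load the model from disk (throws `.modelNotFound` for a bad path).
    try await engine.loadModel(path: modelPath)

    // Every conversation starts from a system prompt.
    let conversation = try await engine.createConversation(systemPrompt: "You are a helpful assistant.")

    // One-shot generation: the full response is returned at once.
    let reply = try await engine.generate(
        conversation: conversation, prompt: "Hello!", audioData: nil, images: nil)
    print(reply)

    // Streaming generation: chunks are yielded as they are produced.
    let stream = await engine.generateStream(
        conversation: conversation, prompt: "Tell me more.", audioData: nil, images: nil)
    for try await chunk in stream {
        print(chunk, terminator: "")
    }
}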

// MARK: - LiteRT-LM Engine Implementation

/// LiteRT-LM based LLM engine using an Objective-C++ bridge.
///
/// Architecture:
/// - Swift `LlmEngine` (this file) -> Obj-C++ `LlmEngineBridge` -> C++ LiteRT-LM
///
/// This approach is necessary because:
/// 1. LiteRT-LM Swift APIs are "coming soon" (as of 2025)
/// 2. Google's own apps use the C++ bridge pattern (verified in litert-samples)
public actor LiteRtLlmEngine: LlmEngine {
    public static let shared = LiteRtLlmEngine()

    public private(set) var isLoaded: Bool = false
    private var currentModelPath: String?
    private var currentConversation: Conversation?
    private var systemPrompt: String = ""

    // Objective-C++ bridge instance
    private var bridge: LlmEngineBridge?

    private let maxTokens = 16384 // not yet used

    private init() {}

    // MARK: - Model Loading

    public func loadModel(path: String) async throws {
        unload()

        guard FileManager.default.fileExists(atPath: path) else {
            throw LlmEngineError.modelNotFound(path: path)
        }

        // Initialize the Objective-C++ bridge
        var error: NSError?
        let newBridge = LlmEngineBridge(
            modelPath: path,
            accelerator: .cpu, // Can use .metal for GPU acceleration
            error: &error
        )
        if let error = error {
            throw LlmEngineError.engineInitializationFailed(underlying: error)
        }
        guard let bridge = newBridge else {
            throw LlmEngineError.engineInitializationFailed(
                underlying: NSError(domain: "LlmEngine", code: -1, userInfo: [NSLocalizedDescriptionKey: "Failed to create bridge"])
            )
        }

        self.bridge = bridge
        self.isLoaded = true
        self.currentModelPath = path
        print("[LiteRtLlmEngine] Model loaded: \(path)")
    }

    // MARK: - Conversation Management

    public func createConversation(systemPrompt: String) throws -> Conversation {
        guard isLoaded else {
            throw LlmEngineError.modelNotLoaded
        }

        self.systemPrompt = systemPrompt
        let conversation = Conversation()
        conversation.messageHistory.append(("system", systemPrompt))

        // Clear any existing history in the bridge
        bridge?.clearHistory()
        // Add system prompt to bridge
        bridge?.add(toHistory: systemPrompt, role: "system")

        self.currentConversation = conversation
        return conversation
    }

    // MARK: - Generation

    public func generate(
        conversation: Conversation,
        prompt: String,
        audioData: Data? = nil,
        images: [UIImage]? = nil
    ) async throws -> String {
        guard conversation.isAlive else {
            throw LlmEngineError.conversationClosed
        }
        guard isLoaded, let bridge = bridge else {
            throw LlmEngineError.modelNotLoaded
        }

        // TODO: Handle multimodal inputs (images, audio)
        // For now, focus on text-only generation
        if audioData != nil || !(images?.isEmpty ?? true) {
            // Multimodal not yet implemented in bridge
            throw LlmEngineError.notImplemented
        }

        // Add user message to history
        conversation.messageHistory.append(("user", prompt))
        bridge.add(toHistory: prompt, role: "user")

        // Generate response
        var error: NSError?
        let response = bridge.generateResponse(prompt, error: &error)
        if let error = error {
            throw LlmEngineError.generationFailed(underlying: error)
        }
        guard let text = response else {
            throw LlmEngineError.generationFailed(
                underlying: NSError(domain: "LlmEngine", code: -1, userInfo: [NSLocalizedDescriptionKey: "Empty response"])
            )
        }

        // Add assistant response to history
        conversation.messageHistory.append(("assistant", text))
        bridge.add(toHistory: text, role: "assistant")

        return text
    }
    public func generateStream(
        conversation: Conversation,
        prompt: String,
        audioData: Data? = nil,
        images: [UIImage]? = nil
    ) -> AsyncThrowingStream<String, Error> {
        AsyncThrowingStream { continuation in
            Task {
                do {
                    guard conversation.isAlive else {
                        throw LlmEngineError.conversationClosed
                    }
                    guard self.isLoaded, let bridge = self.bridge else {
                        throw LlmEngineError.modelNotLoaded
                    }

                    // Handle multimodal (not implemented)
                    if audioData != nil || !(images?.isEmpty ?? true) {
                        throw LlmEngineError.notImplemented
                    }

                    // Add user message to history
                    conversation.messageHistory.append(("user", prompt))
                    bridge.add(toHistory: prompt, role: "user")

                    // Get streaming response from bridge
                    var error: NSError?
                    guard let stream = bridge.generateResponseStream(prompt, error: &error) else {
                        if let error = error {
                            throw LlmEngineError.generationFailed(underlying: error)
                        } else {
                            throw LlmEngineError.generationFailed(
                                underlying: NSError(domain: "LlmEngine", code: -1, userInfo: [NSLocalizedDescriptionKey: "Failed to create stream"])
                            )
                        }
                    }

                    // Read chunks from stream
                    var fullResponse = ""
                    while stream.hasMore {
                        if let chunk = stream.nextChunk() {
                            continuation.yield(chunk)
                            fullResponse.append(chunk)
                        }
                        // Brief sleep so this polling loop yields instead of
                        // spinning on the actor.
                        try await Task.sleep(nanoseconds: 1_000_000) // 1ms
                    }

                    // Close stream
                    stream.close()

                    // Add complete response to history
                    conversation.messageHistory.append(("assistant", fullResponse))
                    bridge.add(toHistory: fullResponse, role: "assistant")

                    continuation.finish()
                } catch {
                    continuation.finish(throwing: error)
                }
            }
        }
    }

    // MARK: - Utility

    public func unload() {
        bridge?.close()
        bridge = nil
        isLoaded = false
        currentModelPath = nil
        currentConversation = nil
        print("[LiteRtLlmEngine] Unloaded")
    }

    public func estimateTokens(text: String) -> Int {
        // Falls back to a rough 4-characters-per-token heuristic when no bridge is loaded.
        return bridge?.estimateTokens(text) ?? (text.count / 4)
    }
}
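
// MARK: - Note on the NSError bridging pattern

/// The bridge reports failures through `NSError` out-parameters rather than
/// Swift `throws`. A hypothetical helper like this one (not used above) shows
/// how that pattern could be folded into idiomatic Swift error handling:
func bridgeCall<T>(_ body: (NSErrorPointer) -> T?) throws -> T {
    var error: NSError?
    let result = body(&error)
    // An NSError written by the bridge wins over a nil result.
    if let error {
        throw error
    }
    guard let result else {
        throw LlmEngineError.generationFailed(
            underlying: NSError(domain: "LlmEngine", code: -1)
        )
    }
    return result
}
// Usage sketch: let text = try bridgeCall { bridge.generateResponse(prompt, error: $0) }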