Files
sleepy_agent_ios/SleepyAgent/Inference/Bridge/LlmEngineBridge.h
sleepy 45d43f2645 Add Objective-C++ bridge for LiteRT-LM integration
- LlmEngineBridge.h/.mm: Objective-C++ wrapper around LiteRT-LM C++ API
- SleepyAgent-Bridging-Header.h: Swift bridging header
- Updated LlmEngine.swift to use the bridge
- Added LITERT_INTEGRATION.md with detailed research findings

Based on analysis of Google's litert-samples repository:
- Google uses a C++ bridge pattern for iOS (confirmed in the image_segmentation example)
- MediaPipe has a working Swift API but is deprecated
- LiteRT-LM Swift APIs are 'coming soon'

The bridge pattern matches how the Google AI Edge Gallery iOS app is likely implemented
2026-04-06 14:54:06 +02:00
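
The commit notes that LlmEngine.swift was updated to call this bridge, which is imported into Swift through SleepyAgent-Bridging-Header.h. Below is a minimal sketch of what such a Swift call site could look like; the LlmEngineSketch wrapper, the error code, and the choice of the Metal accelerator are illustrative assumptions, not the actual LlmEngine.swift. The bridge's Objective-C names surface in Swift via the standard name translation (for example, initWithModelPath:accelerator:error: becomes a throwing initializer).

import Foundation

// Illustrative Swift caller for the bridge declared in LlmEngineBridge.h below.
// Everything except LlmEngineBridge, LiteRTAccelerator and kLiteRTLMErrorDomain is hypothetical.
final class LlmEngineSketch {
    private let bridge: LlmEngineBridge

    init(modelPath: String) throws {
        // initWithModelPath:accelerator:error: imports as a throwing initializer;
        // LiteRTAcceleratorMetal imports as .metal after enum-prefix stripping.
        bridge = try LlmEngineBridge(modelPath: modelPath, accelerator: .metal)
    }

    func reply(to prompt: String) throws -> String {
        guard bridge.isReady else {
            throw NSError(domain: kLiteRTLMErrorDomain, code: -1, userInfo: nil)
        }
        // generateResponse:error: (nullable return + NSError**) imports as a throwing method.
        return try bridge.generateResponse(prompt)
    }
}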

93 lines
2.9 KiB
Objective-C

//
//  LlmEngineBridge.h
//  SleepyAgent
//
//  Objective-C bridge to LiteRT-LM C++ API
//

#import <Foundation/Foundation.h>

NS_ASSUME_NONNULL_BEGIN

/// Error domain for LiteRT-LM bridge errors
extern NSString *const kLiteRTLMErrorDomain;

/// Accelerator options for model inference
typedef NS_ENUM(NSInteger, LiteRTAccelerator) {
    LiteRTAcceleratorCPU = 0,
    LiteRTAcceleratorMetal = 1,
    LiteRTAcceleratorCoreML = 2
};

/// Represents a streaming response from the LLM
@interface LiteRTResponseStream : NSObject

/// Get the next chunk of the response (blocking)
/// Returns nil when stream is complete
- (nullable NSString *)nextChunk;

/// Check if the stream has more data
@property (nonatomic, readonly) BOOL hasMore;

/// Close the stream and release resources
- (void)close;

@end

/// Bridge class for LiteRT-LM LLM inference
/// Wraps the C++ LiteRT-LM API for use in Swift
@interface LlmEngineBridge : NSObject

- (instancetype)init NS_UNAVAILABLE;

/// Initialize the LLM engine with a model file
/// @param modelPath Path to the .litertlm model file
/// @param accelerator Hardware accelerator to use (CPU, Metal, CoreML)
/// @param error Error pointer for initialization failures
/// @return Initialized engine bridge or nil on error
- (nullable instancetype)initWithModelPath:(NSString *)modelPath
                               accelerator:(LiteRTAccelerator)accelerator
                                     error:(NSError **)error NS_DESIGNATED_INITIALIZER;

/// Generate a response for a single prompt (non-streaming)
/// @param prompt The user's input text
/// @param error Error pointer for generation failures
/// @return The generated response or nil on error
- (nullable NSString *)generateResponse:(NSString *)prompt
                                  error:(NSError **)error;

/// Generate a streaming response
/// @param prompt The user's input text
/// @param error Error pointer for generation failures
/// @return A stream object to read response chunks
- (nullable LiteRTResponseStream *)generateResponseStream:(NSString *)prompt
                                                    error:(NSError **)error;

/// Add a message to the conversation history
/// This maintains context for multi-turn conversations (KV cache)
/// @param message The message text
/// @param role The role ("user" or "assistant")
- (void)addToHistory:(NSString *)message
                role:(NSString *)role;

/// Clear the conversation history and reset KV cache
- (void)clearHistory;

/// Check if the engine is initialized and ready
@property (nonatomic, readonly) BOOL isReady;

/// Get the maximum number of tokens the model supports
@property (nonatomic, readonly) NSInteger maxTokens;

/// Estimate the number of tokens in a string
/// @param text The text to measure
/// @return Token count or -1 if estimation fails
- (NSInteger)estimateTokens:(NSString *)text;

/// Close the engine and release all resources
- (void)close;

@end

NS_ASSUME_NONNULL_END
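
For streaming and multi-turn use, the LiteRTResponseStream and history methods declared above could be driven from Swift roughly as follows. This is a sketch under the same name-translation assumptions (addToHistory:role: should surface as add(toHistory:role:)); whether the caller or the bridge itself records each turn is an assumption, not something the header specifies.

// Illustrative streaming loop against the bridge API declared above.
func streamReply(from bridge: LlmEngineBridge, prompt: String) throws -> String {
    // Assumption: the caller records turns; the bridge may instead do this internally.
    bridge.add(toHistory: prompt, role: "user")

    let stream = try bridge.generateResponseStream(prompt)
    defer { stream.close() }   // always release stream resources

    var full = ""
    // nextChunk blocks and returns nil once the stream is complete.
    while let chunk = stream.nextChunk() {
        full += chunk
        print(chunk, terminator: "")
    }

    bridge.add(toHistory: full, role: "assistant")
    return full
}

estimateTokens: and maxTokens presumably let the caller keep the running history inside the model's context window, and clearHistory resets the KV cache when a fresh conversation is needed.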