455d8e4be8
* server : speculative decoding using checkpoints * server : fix draft check with checkpoints * server : rename spec vars * server : log levels * server : refactored spec logic to speculative.cpp * server : renamed spec checkpoints option * server : fix spec checkpoints, logging * speculative : checkpoints with draft model, logging * server : n_tokens_cur and create_checkpoint in draft * server : fix server_speculative_callback (slot.id) * spec : fix ngram-map/begin idx_last_check * spec : init ckpt (begin() wasn't called) * chore: update webui build output * server : restore sampler in spec checkpoint and clear mem * cont : avoid --spec-use-checkpoints argument * cont : remove server_prompt_checkpoint_with_size * spec : rename (leave_draft_state) * cont : clean-up * cont : do not ignore partial drafts even if the are short * cont : spec callback owned by session * cont : simplify * cont : avoid empty speculative session * cont : simplify * cont : simplify * cont : enable mtmd speculative decoding * cont : keep the spec sampler alive * cont : simplify * cont : fix nullptr deref + draft checkpoints * cont : remove common_speculative_accept_response * cont : remove callback * cont : simplify * cont : minor * cont : simplify * cont : fix accepted number --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
54 lines
1.9 KiB
C++
54 lines
1.9 KiB
C++
#pragma once
|
|
|
|
#include "llama.h"
|
|
#include "common.h"
|
|
|
|
struct common_speculative;
|
|
|
|
// comma separated list of all types
|
|
std::string common_speculative_type_name_str();
|
|
|
|
// convert string to type
|
|
enum common_speculative_type common_speculative_type_from_name(const std::string & name);
|
|
|
|
// convert type to string
|
|
std::string common_speculative_type_to_str(enum common_speculative_type type);
|
|
|
|
enum common_speculative_compat_type {
|
|
COMMON_SPECULATIVE_COMPAT_TYPE_NO = 0,
|
|
COMMON_SPECULATIVE_COMPAT_TYPE_FULL = 1,
|
|
COMMON_SPECULATIVE_COMPAT_TYPE_CKPT = 2,
|
|
};
|
|
|
|
// check if the llama_context is compatible for speculative decoding
|
|
// note: clears the memory of the context
|
|
common_speculative_compat_type common_speculative_is_compat(llama_context * ctx_tgt);
|
|
|
|
common_speculative * common_speculative_init(
|
|
common_params_speculative & params,
|
|
llama_context * ctx_tgt);
|
|
|
|
void common_speculative_free(common_speculative * spec);
|
|
|
|
// optionally call once at the beginning of a new generation
|
|
void common_speculative_begin(common_speculative * spec, const llama_tokens & prompt);
|
|
|
|
// sample up to n_draft tokens and add them to the batch using the draft model
|
|
llama_tokens common_speculative_draft(
|
|
common_speculative * spec,
|
|
const common_params_speculative & params,
|
|
const llama_tokens & prompt,
|
|
llama_token id_last);
|
|
|
|
// informs the speculative decoder that n_accepted tokens were accepted by the target model
|
|
void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
|
|
|
|
// print statistics about the speculative decoding
|
|
void common_speculative_print_stats(const common_speculative * spec);
|
|
|
|
struct common_speculative_deleter {
|
|
void operator()(common_speculative * s) { common_speculative_free(s); }
|
|
};
|
|
|
|
typedef std::unique_ptr<common_speculative, common_speculative_deleter> common_speculative_ptr;
|