server: Add cached_tokens info to oaicompat responses (#19361)
* tests : fix fetch_server_test_models.py

* server : to_json_oaicompat cached_tokens

Adds OpenAI- and Anthropic-compatible information about the number of cached prompt tokens used in a response.
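For orientation: in the OpenAI-compatible dialect the cached count surfaces under usage.prompt_tokens_details.cached_tokens, while the Anthropic-compatible dialect reports it as usage.cache_read_input_tokens. The sketch below only illustrates those field names; the helper names, the json alias, and the exact accounting between the counters are assumptions for illustration, not code from this commit.

// Illustrative only: shapes of the two "usage" dialects that can carry a
// cached prompt token count. Field names follow the public OpenAI and
// Anthropic APIs; the helpers themselves are hypothetical.
#include <cstdint>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

// OpenAI-compatible: usage.prompt_tokens_details.cached_tokens
static json usage_openai_sketch(int32_t n_prompt, int32_t n_decoded, int32_t n_cached) {
    return json {
        {"prompt_tokens",     n_prompt},
        {"completion_tokens", n_decoded},
        {"total_tokens",      n_prompt + n_decoded},
        {"prompt_tokens_details", {{"cached_tokens", n_cached}}},
    };
}

// Anthropic-compatible: usage.cache_read_input_tokens
static json usage_anthropic_sketch(int32_t n_prompt, int32_t n_decoded, int32_t n_cached) {
    return json {
        {"input_tokens",            n_prompt},
        {"output_tokens",           n_decoded},
        {"cache_read_input_tokens", n_cached},
    };
}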
@@ -344,6 +344,7 @@ struct server_task_result_cmpl_final : server_task_result {
     bool truncated;
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_prompt_tokens_cache;
     int32_t n_tokens_cached;
     bool has_new_line;
     std::string stopping_word;
@@ -387,6 +388,8 @@ struct server_task_result_cmpl_final : server_task_result {
 
     json to_json_non_oaicompat();
 
+    json usage_json_oaicompat();
+
     json to_json_oaicompat();
 
     json to_json_oaicompat_chat();
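The hunk above declares a usage_json_oaicompat() helper whose body is not shown on this page. A plausible sketch of what such a helper could return, assuming it assembles the OpenAI-style usage object from the fields added to the struct (the conditional emission of prompt_tokens_details is an assumption for illustration):

// Hypothetical stand-in for the real struct, showing one possible shape
// for the usage_json_oaicompat() helper declared above.
#include <cstdint>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

struct cmpl_final_sketch {
    int32_t n_decoded             = 0;
    int32_t n_prompt_tokens       = 0;
    int32_t n_prompt_tokens_cache = 0;

    json usage_json_oaicompat() const {
        json usage = {
            {"completion_tokens", n_decoded},
            {"prompt_tokens",     n_prompt_tokens},
            {"total_tokens",      n_decoded + n_prompt_tokens},
        };
        // Assumption: only attach the details object when part of the
        // prompt was actually reused from the prompt cache.
        if (n_prompt_tokens_cache > 0) {
            usage["prompt_tokens_details"] = {
                {"cached_tokens", n_prompt_tokens_cache},
            };
        }
        return usage;
    }
};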
@@ -408,6 +411,7 @@ struct server_task_result_cmpl_partial : server_task_result {
 
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_prompt_tokens_cache;
 
     bool post_sampling_probs;
     bool is_progress = false;
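For streamed responses, the OpenAI protocol delivers the usage object on a final chunk when the client requests it via stream_options: {"include_usage": true}. Either way, a defensive way for a client to read the new counter back out of a parsed response (sketch, assuming the OpenAI-style field names above and nlohmann::json on the client side):

// Client-side sketch: pull cached_tokens out of a usage object, with
// defaults so it also works against servers that omit the details field.
#include <cstdint>
#include <iostream>
#include <nlohmann/json.hpp>

int main() {
    // Example payload shaped like an OpenAI-compatible response (values are
    // made up for illustration).
    auto res = nlohmann::json::parse(R"({
        "usage": {
            "prompt_tokens": 512,
            "completion_tokens": 64,
            "total_tokens": 576,
            "prompt_tokens_details": { "cached_tokens": 480 }
        }
    })");

    const int32_t cached = res["usage"]
        .value("prompt_tokens_details", nlohmann::json::object())
        .value("cached_tokens", 0);

    std::cout << "cached prompt tokens: " << cached << "\n";
    return 0;
}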