server: Add cached_tokens info to oaicompat responses (#19361)

* tests : fix fetch_server_test_models.py

* server: to_json_oaicompat cached_tokens

Adds OpenAI- and Anthropic-compatible information about the
number of cached prompt tokens used in a response.
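For context, the two wire formats differ: the OpenAI Chat Completions API reports reused prompt tokens under usage.prompt_tokens_details.cached_tokens, while the Anthropic Messages API reports them as usage.cache_read_input_tokens. Below is a minimal sketch of both shapes using nlohmann::json; the counts and variable names are illustrative, not taken from this commit.

#include <nlohmann/json.hpp>

// Illustrative counts; in the server these come from the task result.
static nlohmann::json make_usage_examples() {
    int n_prompt_tokens       = 128; // total prompt tokens
    int n_prompt_tokens_cache = 96;  // tokens served from the prompt cache
    int n_decoded             = 32;  // generated tokens

    // OpenAI-compatible usage object
    nlohmann::json oai_usage = {
        {"prompt_tokens",     n_prompt_tokens},
        {"completion_tokens", n_decoded},
        {"total_tokens",      n_prompt_tokens + n_decoded},
        {"prompt_tokens_details", {
            {"cached_tokens", n_prompt_tokens_cache},
        }},
    };

    // Anthropic-compatible usage object
    nlohmann::json anthropic_usage = {
        {"input_tokens",            n_prompt_tokens},
        {"output_tokens",           n_decoded},
        {"cache_read_input_tokens", n_prompt_tokens_cache},
    };

    return nlohmann::json{{"openai", oai_usage}, {"anthropic", anthropic_usage}};
}

Deriving both shapes from a single n_prompt_tokens_cache counter keeps the two endpoints from drifting apart.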
Ryan Goulden authored on 2026-03-19 11:09:33 -07:00; committed by GitHub
parent 76f2dc70c3
commit 26c9ce1288
6 changed files with 61 additions and 31 deletions
@@ -344,6 +344,7 @@ struct server_task_result_cmpl_final : server_task_result {
     bool truncated;
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_prompt_tokens_cache;
     int32_t n_tokens_cached;
     bool has_new_line;
     std::string stopping_word;
@@ -387,6 +388,8 @@ struct server_task_result_cmpl_final : server_task_result {
     json to_json_non_oaicompat();
+    json usage_json_oaicompat();
     json to_json_oaicompat();
     json to_json_oaicompat_chat();
@@ -408,6 +411,7 @@ struct server_task_result_cmpl_partial : server_task_result {
     int32_t n_decoded;
     int32_t n_prompt_tokens;
+    int32_t n_prompt_tokens_cache;
     bool post_sampling_probs;
     bool is_progress = false;
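A plausible shape for the new usage_json_oaicompat() helper declared above, assuming it folds n_prompt_tokens_cache into the OpenAI-style usage block. This is a sketch under stated assumptions, not the commit's actual implementation: it assumes `json` is the nlohmann alias used throughout server.cpp, that the member names are the fields shown in the diff, and the zero-check gate is a guess at the behavior.

// Sketch only: the real method is a member of server_task_result_cmpl_final.
json server_task_result_cmpl_final::usage_json_oaicompat() {
    json usage = {
        {"completion_tokens", n_decoded},
        {"prompt_tokens",     n_prompt_tokens},
        {"total_tokens",      n_decoded + n_prompt_tokens},
    };
    // Only advertise the cache detail when prompt tokens were actually reused
    // (assumed behavior, not confirmed by the diff shown here).
    if (n_prompt_tokens_cache > 0) {
        usage["prompt_tokens_details"] = {
            {"cached_tokens", n_prompt_tokens_cache},
        };
    }
    return usage;
}

Factoring the usage object into one helper lets to_json_oaicompat() and to_json_oaicompat_chat() share it instead of each building the block by hand.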