server: Add cached_tokens info to oaicompat responses (#19361)
* tests : fix fetch_server_test_models.py
* server : add cached_tokens to to_json_oaicompat

Adds OpenAI- and Anthropic-compatible information about the number of cached prompt tokens used in a response.
This commit is contained in:
@@ -51,6 +51,27 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
|
||||
assert choice["finish_reason"] == finish_reason
|
||||
|
||||
|
||||
def test_chat_completion_cached_tokens():
    """Check that usage.prompt_tokens_details.cached_tokens reports prompt-cache reuse.

    Issues a sequence of similar chat requests against a single-slot server;
    each case pins both the total prompt token count and how many of those
    tokens were served from the KV cache of the preceding request.
    """
    global server
    server.n_slots = 1
    server.start()

    # (user_prompt, expected prompt_tokens, expected cached_tokens)
    cases = [
        ("1 2 3 4 5 6", 77, 0),
        ("1 2 3 4 5 6", 77, 76),
        ("1 2 3 4 5 9", 77, 51),
        ("1 2 3 9 9 9", 77, 47),
    ]
    for user_prompt, n_prompt, n_cache in cases:
        payload = {
            "max_tokens": 8,
            "messages": [
                {"role": "system", "content": "Test"},
                {"role": "user", "content": user_prompt},
            ],
        }
        res = server.make_request("POST", "/chat/completions", data=payload)
        usage = res.body["usage"]
        assert usage["prompt_tokens"] == n_prompt
        assert usage["prompt_tokens_details"]["cached_tokens"] == n_cache
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
|
||||
[
|
||||
|
||||
@@ -63,8 +63,10 @@ def test_anthropic_messages_basic():
|
||||
assert "text" in res.body["content"][0], "Text content block missing 'text' field"
|
||||
assert res.body["stop_reason"] in ["end_turn", "max_tokens"], f"Invalid stop_reason: {res.body.get('stop_reason')}"
|
||||
assert "usage" in res.body, "Missing 'usage' field"
|
||||
assert "cache_read_input_tokens" in res.body["usage"], "Missing usage.cache_read_input_tokens"
|
||||
assert "input_tokens" in res.body["usage"], "Missing usage.input_tokens"
|
||||
assert "output_tokens" in res.body["usage"], "Missing usage.output_tokens"
|
||||
assert isinstance(res.body["usage"]["cache_read_input_tokens"], int), "cache_read_input_tokens should be integer"
|
||||
assert isinstance(res.body["usage"]["input_tokens"], int), "input_tokens should be integer"
|
||||
assert isinstance(res.body["usage"]["output_tokens"], int), "output_tokens should be integer"
|
||||
assert res.body["usage"]["output_tokens"] > 0, "Should have generated some tokens"
|
||||
|
||||
Reference in New Issue
Block a user