From 8bccdbbff9d0d91d54838471f6eea182b9ab1b79 Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Wed, 22 Apr 2026 18:10:56 +0200 Subject: [PATCH] chat: fix parallel_tool_calls default setting based on model capabilities, add tests for parallel tool calls and structured outputs (#22217) * chat: fix parallel_tool_calls default setting based on model capabilities, add tests for parallel tool calls and structured outputs * Fix ty errors. * Fix flake8 err --- scripts/server-test-parallel-tc.py | 991 +++++++++++++++++++++++++++++ scripts/server-test-structured.py | 980 ++++++++++++++++++++++++++++ tools/cli/cli.cpp | 4 +- tools/server/server-common.cpp | 4 +- 4 files changed, 1977 insertions(+), 2 deletions(-) create mode 100755 scripts/server-test-parallel-tc.py create mode 100755 scripts/server-test-structured.py diff --git a/scripts/server-test-parallel-tc.py b/scripts/server-test-parallel-tc.py new file mode 100755 index 000000000..a166c6d72 --- /dev/null +++ b/scripts/server-test-parallel-tc.py @@ -0,0 +1,991 @@ +#!/usr/bin/env python3 +""" +Test parallel tool-calling capability via chat completions endpoint. + +Only run this against models that actually support parallel tool calls — this +script does not attempt to toggle that setting on the server. Each scenario is +explicitly worded so that a capable model SHOULD emit multiple tool calls in a +single assistant turn (either the same tool N times, or several different +tools at once). + +Each test case contains: + - tools: list of tool definitions (OpenAI-compatible) + - messages: initial conversation messages + - mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON) + - expected_parallel: dict describing what constitutes a successful parallel turn + {"min_parallel": int, # minimum tool_calls in one turn + "require_same_tool": Optional[str], # all parallel calls must be this tool + "require_distinct_tools": Optional[int], # >= N distinct tool names in one turn + "min_distinct_args_key": Optional[str]} # parallel calls must span this + # many distinct values of this arg key + - validate: callable(turns, all_tool_calls, final_content) -> (passed, reason) +""" + +import argparse +import json +import requests +import sys + +# --------------------------------------------------------------------------- +# Color / formatting helpers +# --------------------------------------------------------------------------- + +RESET = "\x1b[0m" +BOLD = "\x1b[1m" +DIM = "\x1b[2m" +CYAN = "\x1b[36m" +YELLOW = "\x1b[33m" +GREEN = "\x1b[32m" +RED = "\x1b[31m" +BLUE = "\x1b[34m" +WHITE = "\x1b[97m" +MAGENTA = "\x1b[35m" + + +def _print(text="", end="\n"): + sys.stdout.write(text + end) + sys.stdout.flush() + + +def print_header(title): + bar = "─" * 60 + _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}") + _print( + f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{' ' * max(0, 58 - len(title))}│{RESET}" + ) + _print(f"{BOLD}{CYAN}└{bar}┘{RESET}") + + +def print_turn_banner(turn_idx, n_calls): + color = MAGENTA if n_calls >= 2 else DIM + _print(f"\n {BOLD}{color}▶ turn {turn_idx} — {n_calls} tool call(s){RESET}") + + +def print_tool_call(name, args): + args_str = json.dumps(args) + _print( + f" {BOLD}{YELLOW}⚙ {name}{RESET}{DIM}({args_str}){RESET}" + ) + + +def print_tool_result(result): + preview = result[:140] + ("…" if len(result) > 140 else "") + _print(f" {DIM}{BLUE}↳ {preview}{RESET}") + + +def print_model_output(text): + sys.stdout.write(text) + sys.stdout.flush() + + +def print_pass(reason): + _print(f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}") + 
+ +def print_fail(reason): + _print(f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}") + + +def print_info(msg): + _print(f"{DIM}{msg}{RESET}") + + +def print_warn(msg): + _print(f"{BOLD}{YELLOW}⚠ {msg}{RESET}") + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def chat_completion(url, messages, tools=None, stream=False): + payload = { + "messages": messages, + "stream": stream, + "max_tokens": 4096, + } + if tools: + payload["tools"] = tools + payload["tool_choice"] = "auto" + + try: + response = requests.post(url, json=payload, stream=stream) + response.raise_for_status() + except requests.exceptions.RequestException as e: + body = e.response.content if (e.response is not None) else b"" + print_fail(f"Request error: {e} | body: {body}") + return None + + full_content = "" + reasoning_content = "" + tool_calls: list[dict] = [] + + if stream: + for line in response.iter_lines(): + if not line: + continue + decoded = line.decode("utf-8") + if not decoded.startswith("data: "): + continue + data_str = decoded[6:] + if data_str == "[DONE]": + break + try: + data = json.loads(data_str) + except json.JSONDecodeError: + continue + choices = data.get("choices", []) + if not choices: + continue + delta = choices[0].get("delta", {}) + if delta.get("reasoning_content"): + reasoning_content += delta["reasoning_content"] + if delta.get("content"): + full_content += delta["content"] + print_model_output(delta["content"]) + for tc in delta.get("tool_calls", []): + idx = tc.get("index", 0) + while len(tool_calls) <= idx: + tool_calls.append( + { + "id": "", + "type": "function", + "function": {"name": "", "arguments": ""}, + } + ) + if "id" in tc: + tool_calls[idx]["id"] += tc["id"] + if "function" in tc: + if "name" in tc["function"]: + tool_calls[idx]["function"]["name"] += tc["function"]["name"] + if "arguments" in tc["function"]: + tool_calls[idx]["function"]["arguments"] += tc["function"][ + "arguments" + ] + else: + data = response.json() + choices = data.get("choices", []) + if choices: + msg = choices[0].get("message", {}) + full_content = msg.get("content") or "" + reasoning_content = msg.get("reasoning_content") or "" + tool_calls = msg.get("tool_calls") or [] + if full_content: + print_model_output(full_content) + + result = {"content": full_content, "tool_calls": tool_calls} + if reasoning_content: + result["reasoning_content"] = reasoning_content + return result + + +def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6): + """ + Drive the multi-turn tool-call loop, but record each turn's tool calls + separately so parallelism can be validated. + + Returns (turns, all_tool_calls, final_content) where `turns` is a list + of dicts: {"index": int, "tool_calls": [...], "content": str}. 
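+
+    Illustrative shape of the return value (hypothetical tool-call dicts
+    tc_a / tc_b) for a run with one parallel turn followed by an answer:
+
+        turns == [{"index": 0, "tool_calls": [tc_a, tc_b], "content": ""},
+                  {"index": 1, "tool_calls": [], "content": "summary"}]
+        all_tool_calls == [tc_a, tc_b]
+        final_content == "summary"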
+ """ + msgs = list(messages) + turns: list[dict] = [] + all_tool_calls: list[dict] = [] + + for turn_idx in range(max_turns): + result = chat_completion(url, msgs, tools=tools, stream=stream) + if result is None: + return turns, all_tool_calls, None + + tcs = result.get("tool_calls") or [] + content = result.get("content") or "" + + turns.append( + {"index": turn_idx, "tool_calls": list(tcs), "content": content} + ) + + if not tcs: + if content: + _print(f"\n{DIM}{'·' * 60}{RESET}") + _print(f"{DIM} model response:{RESET}\n") + return turns, all_tool_calls, content + + print_turn_banner(turn_idx, len(tcs)) + all_tool_calls.extend(tcs) + + assistant_msg: dict = { + "role": "assistant", + "content": content, + "tool_calls": tcs, + } + reasoning = result.get("reasoning_content") + if reasoning: + assistant_msg["reasoning_content"] = reasoning + msgs.append(assistant_msg) + + for tc in tcs: + tool_name = tc["function"]["name"] + try: + args = json.loads(tc["function"]["arguments"]) + except json.JSONDecodeError: + args = {} + + print_tool_call(tool_name, args) + + mock_fn = mock_tool_responses.get(tool_name) + if mock_fn: + tool_result = mock_fn(args) + else: + tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"}) + + print_tool_result(tool_result) + + msgs.append( + { + "role": "tool", + "tool_call_id": tc.get("id", ""), + "content": tool_result, + } + ) + + return turns, all_tool_calls, None + + +# --------------------------------------------------------------------------- +# Parallelism helpers +# --------------------------------------------------------------------------- + + +def _best_parallel_turn(turns): + """Return the turn (dict) with the most tool calls, or None if no tools.""" + tool_turns = [t for t in turns if t["tool_calls"]] + if not tool_turns: + return None + return max(tool_turns, key=lambda t: len(t["tool_calls"])) + + +def _distinct_tool_names(turn): + return {tc["function"]["name"] for tc in turn["tool_calls"]} + + +def _distinct_arg_values(turn, key): + values = set() + for tc in turn["tool_calls"]: + try: + args = json.loads(tc["function"]["arguments"]) + except json.JSONDecodeError: + continue + v = args.get(key) + if v is not None: + if isinstance(v, str): + values.add(v.strip().lower()) + else: + values.add(v) + return values + + +def _check_parallel(turns, expected): + """ + Check that at least one turn satisfies the parallel-call expectations. + Returns (ok, reason). 
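+
+    For example, MULTIFILE_READ_TEST below supplies expectations of the form
+
+        {"min_parallel": 4,
+         "require_same_tool": "read_file",
+         "min_distinct_args_key": "path",
+         "min_distinct_args_count": 4}
+
+    i.e. some single turn must contain at least four read_file calls
+    spanning four distinct "path" values.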
+ """ + best = _best_parallel_turn(turns) + if best is None: + return False, "No tool calls were made at all" + + min_parallel = expected.get("min_parallel", 2) + if len(best["tool_calls"]) < min_parallel: + by_turn = [len(t["tool_calls"]) for t in turns] + return False, ( + f"No turn had >= {min_parallel} parallel tool calls " + f"(per-turn counts: {by_turn})" + ) + + require_same = expected.get("require_same_tool") + if require_same is not None: + names = [tc["function"]["name"] for tc in best["tool_calls"]] + if any(n != require_same for n in names): + return False, ( + f"Parallel turn mixed tools; expected all {require_same!r}, got {names}" + ) + + require_distinct = expected.get("require_distinct_tools") + if require_distinct is not None: + distinct = _distinct_tool_names(best) + if len(distinct) < require_distinct: + return False, ( + f"Parallel turn had only {len(distinct)} distinct tool names " + f"({distinct}); need >= {require_distinct}" + ) + + distinct_key = expected.get("min_distinct_args_key") + distinct_count = expected.get("min_distinct_args_count", min_parallel) + if distinct_key is not None: + values = _distinct_arg_values(best, distinct_key) + if len(values) < distinct_count: + return False, ( + f"Parallel turn had only {len(values)} distinct {distinct_key!r} " + f"values ({values}); need >= {distinct_count}" + ) + + return True, ( + f"Parallel turn had {len(best['tool_calls'])} calls across " + f"{len(_distinct_tool_names(best))} distinct tool(s)" + ) + + +# --------------------------------------------------------------------------- +# Test case runner +# --------------------------------------------------------------------------- + + +def run_test(url, test_case, stream): + name = test_case["name"] + mode = f"{'stream' if stream else 'non-stream'}" + print_header(f"{name} [{mode}]") + + turns, all_tool_calls, final_content = run_agentic_loop( + url, + messages=test_case["messages"], + tools=test_case["tools"], + mock_tool_responses=test_case["mock_tool_responses"], + stream=stream, + ) + + if not turns: + print_fail("No response from server.") + return False + + parallel_ok, parallel_reason = _check_parallel(turns, test_case["expected_parallel"]) + if not parallel_ok: + print_fail(parallel_reason) + return False + + passed, reason = test_case["validate"](turns, all_tool_calls, final_content) + if passed: + print_pass(f"{parallel_reason}; {reason}") + else: + print_fail(reason) + return passed + + +# --------------------------------------------------------------------------- +# Test case definitions +# --------------------------------------------------------------------------- + +# ---- Test 1: Multi-file read (same tool, multiple distinct paths) ---- + +_FILE_TOOLS = [ + { + "type": "function", + "function": { + "name": "read_file", + "description": ( + "Read the full contents of a file from the local filesystem. " + "Call this tool in parallel when asked to read several files — " + "each path needs its own call." 
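+                # The mock below (_read_file_mock) serves the four config
+                # paths seeded in _FILE_CONTENTS and returns a JSON
+                # "not found" error for anything else.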
+ ), + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Absolute or repo-relative path to a file", + }, + }, + "required": ["path"], + }, + }, + }, +] + +_FILE_CONTENTS = { + "config/database.yml": "host: db.internal\nport: 5432\nuser: svc_app\n", + "config/redis.yml": "host: cache.internal\nport: 6379\ndb: 0\n", + "config/queue.yml": "broker: rabbitmq.internal\nport: 5672\nvhost: prod\n", + "config/auth.yml": "provider: oidc\nissuer: https://auth.internal\n", +} + + +def _read_file_mock(args): + path = args.get("path", "") + norm = path.lstrip("./").lstrip("/") + content = _FILE_CONTENTS.get(norm) + if content is None: + for k, v in _FILE_CONTENTS.items(): + if path.endswith(k): + content = v + break + if content is None: + return json.dumps({"path": path, "error": "not found"}) + return json.dumps({"path": path, "content": content}) + + +MULTIFILE_READ_TEST = { + "name": "Parallel multi-file read (same tool, 4 distinct paths)", + "tools": _FILE_TOOLS, + "messages": [ + { + "role": "user", + "content": ( + "Please read all four of these config files so I can review them " + "together: config/database.yml, config/redis.yml, config/queue.yml, " + "and config/auth.yml. Call read_file for every path in parallel in " + "a single batch — do NOT read them one by one sequentially across " + "turns. After you have all four, give me a one-line summary of each." + ), + } + ], + "mock_tool_responses": {"read_file": _read_file_mock}, + "expected_parallel": { + "min_parallel": 4, + "require_same_tool": "read_file", + "min_distinct_args_key": "path", + "min_distinct_args_count": 4, + }, + "validate": lambda turns, tcs, content: _validate_multifile(turns, tcs, content), +} + + +def _validate_multifile(turns, tcs, content): + del turns + if not content: + return False, "No final summary produced" + return True, f"{len(tcs)} total read_file calls; content length={len(content)}" + + +# ---- Test 2: Batch TODO marking (same tool, N calls in one turn) ---- + +_TODO_TOOLS = [ + { + "type": "function", + "function": { + "name": "mark_todo_complete", + "description": ( + "Mark a single TODO item as complete by ID. When the user wants " + "several items marked at once, call this tool in parallel — " + "one call per item — rather than sequentially across turns." + ), + "parameters": { + "type": "object", + "properties": { + "todo_id": { + "type": "string", + "description": "Identifier of the TODO item", + }, + "note": { + "type": "string", + "description": "Optional completion note", + }, + }, + "required": ["todo_id"], + }, + }, + }, +] + +_TODO_DB = { + "T-101": "Draft onboarding doc", + "T-102": "Update dependency lockfile", + "T-103": "Fix flaky login test", + "T-104": "Rotate service credentials", + "T-105": "Archive Q4 reports", +} + + +def _mark_todo_mock(args): + tid = args.get("todo_id", "") + if tid in _TODO_DB: + return json.dumps({"todo_id": tid, "title": _TODO_DB[tid], "status": "done"}) + return json.dumps({"todo_id": tid, "error": "unknown id"}) + + +TODO_BATCH_TEST = { + "name": "Batch TODO completion (same tool, 5 IDs in one turn)", + "tools": _TODO_TOOLS, + "messages": [ + { + "role": "user", + "content": ( + "I finished every item on today's list. Please mark all of the " + "following TODOs as complete, in one parallel batch: T-101, T-102, " + "T-103, T-104, T-105. Don't mark them one at a time across separate " + "turns — issue all five mark_todo_complete calls at once. Afterwards " + "confirm which ones succeeded." 
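+                # A passing run (sketch): one assistant turn whose tool_calls
+                # hold five mark_todo_complete calls, one per ID T-101..T-105.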
+ ), + } + ], + "mock_tool_responses": {"mark_todo_complete": _mark_todo_mock}, + "expected_parallel": { + "min_parallel": 5, + "require_same_tool": "mark_todo_complete", + "min_distinct_args_key": "todo_id", + "min_distinct_args_count": 5, + }, + "validate": lambda turns, tcs, content: _validate_todo(turns, tcs, content), +} + + +def _validate_todo(turns, tcs, content): + del turns + if not content: + return False, "No confirmation summary produced" + return True, f"{len(tcs)} total mark_todo_complete calls" + + +# ---- Test 3: Multi-city weather (same tool, N parallel locations) ---- + +_WEATHER_TOOLS = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": ( + "Fetch current weather for ONE city. When the user asks about " + "several cities, call this tool in parallel — one call per city — " + "instead of sequentially." + ), + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"}, + "units": { + "type": "string", + "enum": ["metric", "imperial"], + "default": "metric", + }, + }, + "required": ["city"], + }, + }, + }, +] + +_WEATHER_DB = { + "tokyo": {"city": "Tokyo", "temp_c": 18.4, "condition": "partly cloudy", "humidity": 64}, + "london": {"city": "London", "temp_c": 9.1, "condition": "overcast", "humidity": 81}, + "new york": {"city": "New York", "temp_c": 12.7, "condition": "clear", "humidity": 55}, + "paris": {"city": "Paris", "temp_c": 11.3, "condition": "light rain", "humidity": 78}, +} + + +def _weather_mock(args): + city = args.get("city", "").strip().lower() + if city.startswith("new york"): + city = "new york" + if city in _WEATHER_DB: + return json.dumps(_WEATHER_DB[city]) + return json.dumps({"city": args.get("city", ""), "error": "unknown city"}) + + +MULTI_WEATHER_TEST = { + "name": "Parallel multi-city weather (same tool, 4 cities)", + "tools": _WEATHER_TOOLS, + "messages": [ + { + "role": "user", + "content": ( + "I'm comparing today's weather across four cities for a travel " + "decision: Tokyo, London, New York, and Paris. Please call " + "get_weather for all four in parallel in a single turn — don't " + "fetch them one at a time. Then rank them from warmest to coolest." + ), + } + ], + "mock_tool_responses": {"get_weather": _weather_mock}, + "expected_parallel": { + "min_parallel": 4, + "require_same_tool": "get_weather", + "min_distinct_args_key": "city", + "min_distinct_args_count": 4, + }, + "validate": lambda turns, tcs, content: _validate_weather(turns, tcs, content), +} + + +def _validate_weather(turns, tcs, content): + del turns + if not content or not any( + kw in content.lower() for kw in ("warmest", "rank", "hot", "cool") + ): + return False, f"Final content missing a ranking: {content!r}" + return True, f"{len(tcs)} total get_weather calls; ranking produced" + + +# ---- Test 4: Trip planning (different tools, parallel in one turn) ---- + +_TRIP_TOOLS = [ + { + "type": "function", + "function": { + "name": "search_flights", + "description": "Search one-way flights between two airports on a given date.", + "parameters": { + "type": "object", + "properties": { + "from_airport": {"type": "string", "description": "IATA code, e.g. SFO"}, + "to_airport": {"type": "string", "description": "IATA code, e.g. 
JFK"}, + "date": {"type": "string", "description": "YYYY-MM-DD"}, + }, + "required": ["from_airport", "to_airport", "date"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search_hotels", + "description": "Search hotels in a city for a date range.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + "check_in": {"type": "string", "description": "YYYY-MM-DD"}, + "check_out": {"type": "string", "description": "YYYY-MM-DD"}, + "max_price": {"type": "integer"}, + }, + "required": ["city", "check_in", "check_out"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search_restaurants", + "description": "Search restaurants in a city by cuisine.", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string"}, + "cuisine": {"type": "string"}, + }, + "required": ["city"], + }, + }, + }, +] + +_FLIGHTS_RESULT = { + "results": [ + {"flight": "UA 1552", "depart": "08:15", "arrive": "16:45", "price": 389}, + {"flight": "AA 20", "depart": "10:00", "arrive": "18:35", "price": 412}, + ] +} +_HOTELS_RESULT = { + "results": [ + {"name": "Midtown Grand", "nightly_rate": 245, "rating": 4.3}, + {"name": "Harbour Boutique", "nightly_rate": 312, "rating": 4.6}, + ] +} +_RESTAURANTS_RESULT = { + "results": [ + {"name": "Trattoria Nona", "cuisine": "italian", "rating": 4.5}, + {"name": "Osteria Blu", "cuisine": "italian", "rating": 4.4}, + ] +} + +TRIP_PLAN_TEST = { + "name": "Trip planning (3 different tools in parallel)", + "tools": _TRIP_TOOLS, + "messages": [ + { + "role": "user", + "content": ( + "I'm flying from SFO to JFK on 2026-06-12 and staying four nights " + "(check out 2026-06-16). I'd also like some Italian restaurant " + "suggestions in New York. Please call search_flights, search_hotels, " + "and search_restaurants in parallel — all three in a single turn, " + "since they don't depend on each other. Then give me a concise " + "travel summary." + ), + } + ], + "mock_tool_responses": { + "search_flights": lambda _: json.dumps(_FLIGHTS_RESULT), + "search_hotels": lambda _: json.dumps(_HOTELS_RESULT), + "search_restaurants": lambda _: json.dumps(_RESTAURANTS_RESULT), + }, + "expected_parallel": { + "min_parallel": 3, + "require_distinct_tools": 3, + }, + "validate": lambda turns, tcs, content: _validate_trip(turns, tcs, content), +} + + +def _validate_trip(turns, tcs, content): + del turns + names = {tc["function"]["name"] for tc in tcs} + required = {"search_flights", "search_hotels", "search_restaurants"} + missing = required - names + if missing: + return False, f"Missing tool calls: {missing}" + if not content: + return False, "No travel summary produced" + return True, f"All three tools called; summary length={len(content)}" + + +# ---- Test 5: Portfolio check (same tool, parallel tickers) ---- + +_STOCK_TOOLS = [ + { + "type": "function", + "function": { + "name": "get_stock_quote", + "description": ( + "Get the latest quote for ONE ticker. When the user asks about " + "multiple tickers, call this tool in parallel — one per symbol — " + "rather than sequentially." 
+ ), + "parameters": { + "type": "object", + "properties": { + "symbol": {"type": "string", "description": "Ticker symbol"}, + }, + "required": ["symbol"], + }, + }, + }, +] + +_STOCK_DB = { + "AAPL": {"symbol": "AAPL", "price": 218.45, "change_pct": "+0.8%"}, + "MSFT": {"symbol": "MSFT", "price": 421.10, "change_pct": "+1.2%"}, + "GOOGL":{"symbol": "GOOGL","price": 175.22, "change_pct": "-0.3%"}, + "AMZN": {"symbol": "AMZN", "price": 189.76, "change_pct": "+0.5%"}, + "NVDA": {"symbol": "NVDA", "price": 140.88, "change_pct": "+2.4%"}, +} + + +def _stock_mock(args): + sym = args.get("symbol", "").strip().upper() + if sym in _STOCK_DB: + return json.dumps(_STOCK_DB[sym]) + return json.dumps({"symbol": sym, "error": "unknown ticker"}) + + +PORTFOLIO_TEST = { + "name": "Portfolio check (same tool, 5 tickers in parallel)", + "tools": _STOCK_TOOLS, + "messages": [ + { + "role": "user", + "content": ( + "Pull the latest quote for every ticker in my portfolio — AAPL, " + "MSFT, GOOGL, AMZN, and NVDA — in a single parallel batch. These " + "lookups are independent, so please don't chain them across turns. " + "Once you have all five, tell me which ticker had the biggest " + "percentage change today." + ), + } + ], + "mock_tool_responses": {"get_stock_quote": _stock_mock}, + "expected_parallel": { + "min_parallel": 5, + "require_same_tool": "get_stock_quote", + "min_distinct_args_key": "symbol", + "min_distinct_args_count": 5, + }, + "validate": lambda turns, tcs, content: _validate_portfolio(turns, tcs, content), +} + + +def _validate_portfolio(turns, tcs, content): + del turns + if not content or ("nvda" not in content.lower() and "NVDA" not in content): + return False, f"Expected NVDA to be identified as the biggest mover: {content!r}" + return True, f"{len(tcs)} total quotes pulled" + + +# ---- Test 6: Mixed — translate + dictionary in parallel for the same word ---- + +_LANG_TOOLS = [ + { + "type": "function", + "function": { + "name": "translate_text", + "description": "Translate a short text into a target language.", + "parameters": { + "type": "object", + "properties": { + "text": {"type": "string"}, + "target_language": {"type": "string", + "description": "ISO 639-1 language code, e.g. 
'es'"}, + }, + "required": ["text", "target_language"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_definition", + "description": "Get the English dictionary definition of a word.", + "parameters": { + "type": "object", + "properties": { + "word": {"type": "string"}, + }, + "required": ["word"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_synonyms", + "description": "Get English synonyms for a word.", + "parameters": { + "type": "object", + "properties": { + "word": {"type": "string"}, + }, + "required": ["word"], + }, + }, + }, +] + + +def _translate_mock(args): + t = args.get("text", "") + lang = args.get("target_language", "") + return json.dumps({"source": t, "target_language": lang, "translation": f"[{lang}] {t}"}) + + +def _definition_mock(args): + w = args.get("word", "") + return json.dumps({ + "word": w, + "definition": f"A standard dictionary definition of {w!r}.", + }) + + +def _synonyms_mock(args): + w = args.get("word", "") + return json.dumps({ + "word": w, + "synonyms": ["synonym_a", "synonym_b", "synonym_c"], + }) + + +LANG_TOOLKIT_TEST = { + "name": "Language toolkit (translate + definition + synonyms in parallel)", + "tools": _LANG_TOOLS, + "messages": [ + { + "role": "user", + "content": ( + "For the English word 'resilient', I need three independent " + "look-ups at once: (a) translate it into Spanish, (b) fetch its " + "dictionary definition, and (c) list its synonyms. These three " + "calls don't depend on each other — please issue them in parallel " + "in a single turn. Then present the combined results as a short " + "language note." + ), + } + ], + "mock_tool_responses": { + "translate_text": _translate_mock, + "get_definition": _definition_mock, + "get_synonyms": _synonyms_mock, + }, + "expected_parallel": { + "min_parallel": 3, + "require_distinct_tools": 3, + }, + "validate": lambda turns, tcs, content: _validate_lang(turns, tcs, content), +} + + +def _validate_lang(turns, tcs, content): + del turns + names = {tc["function"]["name"] for tc in tcs} + required = {"translate_text", "get_definition", "get_synonyms"} + missing = required - names + if missing: + return False, f"Missing tool calls: {missing}" + if not content: + return False, "No language note produced" + return True, f"All three lookup tools called; note length={len(content)}" + + +# --------------------------------------------------------------------------- +# All test cases +# --------------------------------------------------------------------------- + +ALL_TEST_CASES = [ + MULTIFILE_READ_TEST, + TODO_BATCH_TEST, + MULTI_WEATHER_TEST, + TRIP_PLAN_TEST, + PORTFOLIO_TEST, + LANG_TOOLKIT_TEST, +] + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description=( + "Test llama-server parallel tool-calling capability. Run this only " + "against models configured for parallel tool calls — this script " + "does not configure that itself." 
+ ) + ) + parser.add_argument("--host", default="localhost") + parser.add_argument("--port", default=8080, type=int) + parser.add_argument( + "--no-stream", action="store_true", help="Disable streaming mode tests" + ) + parser.add_argument( + "--stream-only", action="store_true", help="Only run streaming mode tests" + ) + parser.add_argument( + "--test", + help="Run only the test whose name contains this substring (case-insensitive)", + ) + args = parser.parse_args() + + url = f"http://{args.host}:{args.port}/v1/chat/completions" + print_info(f"Testing server at {url}") + print_warn( + "This script expects the target model to emit multiple tool calls in a " + "single assistant turn. Run it only against parallel-tool-capable models." + ) + + modes: list[bool] = [] + if not args.stream_only: + modes.append(False) + if not args.no_stream: + modes.append(True) + + cases: list[dict] = ALL_TEST_CASES + if args.test: + name_filter = args.test.lower() + cases = [c for c in cases if name_filter in str(c["name"]).lower()] + if not cases: + print_fail(f"No test cases matched '{args.test}'") + sys.exit(1) + + total = 0 + passed = 0 + for stream in modes: + for case in cases: + total += 1 + if run_test(url, case, stream=stream): + passed += 1 + + color = GREEN if passed == total else RED + _print(f"\n{BOLD}{color}{'─' * 60}{RESET}") + _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}") + _print(f"{BOLD}{color}{'─' * 60}{RESET}\n") + sys.exit(0 if passed == total else 1) + + +if __name__ == "__main__": + main() diff --git a/scripts/server-test-structured.py b/scripts/server-test-structured.py new file mode 100755 index 000000000..98ff473b9 --- /dev/null +++ b/scripts/server-test-structured.py @@ -0,0 +1,980 @@ +#!/usr/bin/env python3 +""" +Test structured output capability via chat completions endpoint. + +Each test case contains: + - response_format: OpenAI-compatible response_format specification + (json_schema only — llama.cpp does not support json_object) + - messages: initial conversation messages + - tools (optional): tool definitions (for mixed tool + structured tests) + - mock_tool_responses (optional): dict mapping tool_name -> callable(arguments) -> str (JSON) + - apply_stage: "always" to apply response_format to every request, + "after_tools" to run the tool loop plain, then request a + structured summary in a follow-up user turn. + - followup (optional, for after_tools): user message appended before the + final structured call. 
+ - validate: callable(parsed_json, tool_calls_history, raw_content) -> (passed: bool, reason: str) +""" + +import argparse +import json +import requests +import sys +from typing import Any, cast + +# --------------------------------------------------------------------------- +# Color / formatting helpers +# --------------------------------------------------------------------------- + +RESET = "\x1b[0m" +BOLD = "\x1b[1m" +DIM = "\x1b[2m" +CYAN = "\x1b[36m" +YELLOW = "\x1b[33m" +GREEN = "\x1b[32m" +RED = "\x1b[31m" +BLUE = "\x1b[34m" +WHITE = "\x1b[97m" +MAGENTA = "\x1b[35m" + + +def _print(text="", end="\n"): + sys.stdout.write(text + end) + sys.stdout.flush() + + +def print_header(title): + bar = "─" * 60 + _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}") + _print( + f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{' ' * max(0, 58 - len(title))}│{RESET}" + ) + _print(f"{BOLD}{CYAN}└{bar}┘{RESET}") + + +def print_tool_call(name, args): + args_str = json.dumps(args) + _print( + f"\n {BOLD}{YELLOW}⚙ tool call{RESET} {CYAN}{name}{RESET}{DIM}({args_str}){RESET}" + ) + + +def print_tool_result(result): + preview = result[:160] + ("…" if len(result) > 160 else "") + _print(f" {DIM}{BLUE}↳ result{RESET} {DIM}{preview}{RESET}") + + +def print_model_output(text): + sys.stdout.write(text) + sys.stdout.flush() + + +def print_pass(reason): + _print(f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}") + + +def print_fail(reason): + _print(f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}") + + +def print_info(msg): + _print(f"{DIM}{msg}{RESET}") + + +def print_schema_note(label, rf): + kind = rf.get("type", "?") + name = "" + if kind == "json_schema": + name = rf.get("json_schema", {}).get("name", "") + _print(f"{DIM}{MAGENTA} ⟐ response_format [{label}]: {kind}" + f"{(' / ' + name) if name else ''}{RESET}") + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def chat_completion(url, messages, tools=None, response_format=None, stream=False): + payload = { + "messages": messages, + "stream": stream, + "max_tokens": 4096, + } + if tools: + payload["tools"] = tools + payload["tool_choice"] = "auto" + if response_format is not None: + payload["response_format"] = response_format + + try: + response = requests.post(url, json=payload, stream=stream) + response.raise_for_status() + except requests.exceptions.RequestException as e: + body = e.response.content if (e.response is not None) else b"" + print_fail(f"Request error: {e} | body: {body}") + return None + + full_content = "" + reasoning_content = "" + tool_calls: list[dict] = [] + + if stream: + for line in response.iter_lines(): + if not line: + continue + decoded = line.decode("utf-8") + if not decoded.startswith("data: "): + continue + data_str = decoded[6:] + if data_str == "[DONE]": + break + try: + data = json.loads(data_str) + except json.JSONDecodeError: + continue + choices = data.get("choices", []) + if not choices: + continue + delta = choices[0].get("delta", {}) + if delta.get("reasoning_content"): + reasoning_content += delta["reasoning_content"] + if delta.get("content"): + full_content += delta["content"] + print_model_output(delta["content"]) + for tc in delta.get("tool_calls", []): + idx = tc.get("index", 0) + while len(tool_calls) <= idx: + tool_calls.append( + { + "id": "", + "type": "function", + "function": {"name": "", "arguments": ""}, + } + ) + if "id" in tc: + tool_calls[idx]["id"] += tc["id"] + if "function" in tc: + if "name" in 
tc["function"]: + tool_calls[idx]["function"]["name"] += tc["function"]["name"] + if "arguments" in tc["function"]: + tool_calls[idx]["function"]["arguments"] += tc["function"][ + "arguments" + ] + else: + data = response.json() + choices = data.get("choices", []) + if choices: + msg = choices[0].get("message", {}) + full_content = msg.get("content") or "" + reasoning_content = msg.get("reasoning_content") or "" + tool_calls = msg.get("tool_calls") or [] + if full_content: + print_model_output(full_content) + + result = {"content": full_content, "tool_calls": tool_calls} + if reasoning_content: + result["reasoning_content"] = reasoning_content + return result + + +def run_tool_loop( + url, messages, tools, mock_tool_responses, stream, response_format=None, + max_turns=6, +): + """ + Drive the tool-call loop. If response_format is provided it is applied to + every request. Returns (all_tool_calls, final_messages, final_content). + """ + msgs = list(messages) + all_tool_calls: list[dict] = [] + + for _ in range(max_turns): + result = chat_completion( + url, msgs, tools=tools, response_format=response_format, stream=stream + ) + if result is None: + return all_tool_calls, msgs, None + + tcs = result.get("tool_calls") or [] + content = result.get("content") or "" + + if not tcs: + if content: + _print(f"\n{DIM}{'·' * 60}{RESET}") + return all_tool_calls, msgs, content + + all_tool_calls.extend(tcs) + + assistant_msg: dict = { + "role": "assistant", + "content": content, + "tool_calls": tcs, + } + reasoning = result.get("reasoning_content") + if reasoning: + assistant_msg["reasoning_content"] = reasoning + msgs.append(assistant_msg) + + for tc in tcs: + tool_name = tc["function"]["name"] + try: + args = json.loads(tc["function"]["arguments"]) + except json.JSONDecodeError: + args = {} + + print_tool_call(tool_name, args) + + mock_fn = mock_tool_responses.get(tool_name) if mock_tool_responses else None + if mock_fn: + tool_result = mock_fn(args) + else: + tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"}) + + print_tool_result(tool_result) + + msgs.append( + { + "role": "tool", + "tool_call_id": tc.get("id", ""), + "content": tool_result, + } + ) + + return all_tool_calls, msgs, None + + +# --------------------------------------------------------------------------- +# Test case runner +# --------------------------------------------------------------------------- + + +def _try_parse_json(text): + """Attempt to parse text as JSON, trimming common markdown fences.""" + if text is None: + return None + stripped = text.strip() + if stripped.startswith("```"): + lines = stripped.splitlines() + if lines and lines[0].startswith("```"): + lines = lines[1:] + if lines and lines[-1].strip().startswith("```"): + lines = lines[:-1] + stripped = "\n".join(lines).strip() + try: + return json.loads(stripped) + except json.JSONDecodeError: + return None + + +def run_test(url, test_case, stream): + name = test_case["name"] + mode = f"{'stream' if stream else 'non-stream'}" + apply_stage = test_case.get("apply_stage", "always") + print_header(f"{name} [{mode}] ({apply_stage})") + + response_format = test_case["response_format"] + print_schema_note(apply_stage, response_format) + + tools = test_case.get("tools") + mocks = test_case.get("mock_tool_responses") or {} + + all_tcs: list[dict] = [] + final_content = None + + if apply_stage == "always": + all_tcs, _msgs, final_content = run_tool_loop( + url, + messages=list(test_case["messages"]), + tools=tools, + mock_tool_responses=mocks, + stream=stream, 
+ response_format=response_format, + ) + elif apply_stage == "after_tools": + # Phase 1: plain tool loop, no response_format applied yet. + all_tcs, msgs, interim_content = run_tool_loop( + url, + messages=list(test_case["messages"]), + tools=tools, + mock_tool_responses=mocks, + stream=stream, + response_format=None, + ) + if interim_content: + msgs.append({"role": "assistant", "content": interim_content}) + followup = test_case.get( + "followup", + "Now output the answer strictly as JSON matching the provided schema. " + "Do not include commentary.", + ) + msgs.append({"role": "user", "content": followup}) + + # Phase 2: request final structured output. Tools are not passed so the + # model focuses on producing the schema-constrained answer. + _print(f"\n{DIM}{MAGENTA} ⟐ follow-up turn with response_format applied{RESET}") + result = chat_completion( + url, msgs, tools=None, response_format=response_format, stream=stream + ) + final_content = result["content"] if result else None + else: + print_fail(f"Unknown apply_stage: {apply_stage}") + return False + + if final_content is None: + print_fail("No final content from server.") + return False + + parsed = _try_parse_json(final_content) + if parsed is None: + print_fail(f"Final content is not valid JSON: {final_content[:200]!r}") + return False + + passed, reason = test_case["validate"](parsed, all_tcs, final_content) + if passed: + print_pass(reason) + else: + print_fail(reason) + return passed + + +# --------------------------------------------------------------------------- +# Test case definitions +# --------------------------------------------------------------------------- + +# ---- Test 1: Book metadata extraction (always / json_schema) ---- + +_BOOK_SCHEMA = { + "type": "json_schema", + "json_schema": { + "name": "book_metadata", + "strict": True, + "schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "title": {"type": "string"}, + "author": {"type": "string"}, + "year": {"type": "integer"}, + "genre": { + "type": "string", + "enum": [ + "fiction", + "non-fiction", + "fantasy", + "sci-fi", + "mystery", + "biography", + "history", + "other", + ], + }, + "page_count": {"type": "integer"}, + }, + "required": ["title", "author", "year", "genre", "page_count"], + }, + }, +} + +BOOK_TEST_CASE = { + "name": "Book metadata extraction (json_schema, always)", + "response_format": _BOOK_SCHEMA, + "apply_stage": "always", + "messages": [ + { + "role": "user", + "content": ( + "Extract book metadata from this description: " + "'Dune is a 1965 science fiction epic by Frank Herbert, spanning roughly " + "688 pages in its first edition, set on the desert planet Arrakis.' " + "Return the data as JSON." 
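+                # One schema-conforming reply (sketch): {"title": "Dune",
+                # "author": "Frank Herbert", "year": 1965, "genre": "sci-fi",
+                # "page_count": 688}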
+ ), + } + ], + "validate": lambda parsed, tcs, raw: _validate_book(parsed), +} + + +def _validate_book(parsed): + required = {"title", "author", "year", "genre", "page_count"} + missing = required - parsed.keys() + if missing: + return False, f"Missing fields: {missing}" + if not isinstance(parsed["title"], str) or not parsed["title"]: + return False, "title must be a non-empty string" + if not isinstance(parsed["author"], str) or "herbert" not in parsed["author"].lower(): + return False, f"author unexpected: {parsed['author']!r}" + if not isinstance(parsed["year"], int) or parsed["year"] != 1965: + return False, f"year should be 1965, got {parsed['year']!r}" + if parsed["genre"] not in { + "fiction", "non-fiction", "fantasy", "sci-fi", "mystery", + "biography", "history", "other", + }: + return False, f"genre not in enum: {parsed['genre']!r}" + if not isinstance(parsed["page_count"], int) or parsed["page_count"] <= 0: + return False, f"page_count should be positive int: {parsed['page_count']!r}" + return True, f"Book: {parsed['title']} ({parsed['year']}) / {parsed['genre']}" + + +# ---- Test 2: Sentiment classification (always / enum-constrained) ---- + +_SENTIMENT_SCHEMA = { + "type": "json_schema", + "json_schema": { + "name": "sentiment_analysis", + "strict": True, + "schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "sentiment": { + "type": "string", + "enum": ["positive", "negative", "neutral"], + }, + "confidence": {"type": "number"}, + "keywords": { + "type": "array", + "items": {"type": "string"}, + "minItems": 1, + "maxItems": 5, + }, + }, + "required": ["sentiment", "confidence", "keywords"], + }, + }, +} + +SENTIMENT_TEST_CASE = { + "name": "Sentiment analysis with enum and array", + "response_format": _SENTIMENT_SCHEMA, + "apply_stage": "always", + "messages": [ + { + "role": "user", + "content": ( + "Analyse the sentiment of this review and return JSON with the " + "detected sentiment label, a confidence score between 0 and 1, " + "and up to five keyword strings that drove the classification:\n\n" + "'This product completely exceeded my expectations. 
The build " + "quality is phenomenal, it arrived a day early, and customer " + "support was delightful when I had a setup question.'" + ), + } + ], + "validate": lambda parsed, tcs, raw: _validate_sentiment(parsed), +} + + +def _validate_sentiment(parsed): + if parsed.get("sentiment") not in {"positive", "negative", "neutral"}: + return False, f"sentiment not in enum: {parsed.get('sentiment')!r}" + if parsed["sentiment"] != "positive": + return False, f"expected positive sentiment, got {parsed['sentiment']}" + conf = parsed.get("confidence") + if not isinstance(conf, (int, float)) or not (0.0 <= conf <= 1.0): + return False, f"confidence not in [0,1]: {conf!r}" + kws = parsed.get("keywords") + if not isinstance(kws, list) or not (1 <= len(kws) <= 5): + return False, f"keywords length out of range: {kws!r}" + if not all(isinstance(k, str) and k for k in kws): + return False, f"keywords must be non-empty strings: {kws!r}" + return True, f"sentiment={parsed['sentiment']} conf={conf} kws={kws}" + + +# ---- Test 3: Nested recipe schema (always) ---- + +_RECIPE_SCHEMA = { + "type": "json_schema", + "json_schema": { + "name": "recipe", + "strict": True, + "schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "name": {"type": "string"}, + "servings": {"type": "integer"}, + "ingredients": { + "type": "array", + "minItems": 2, + "items": { + "type": "object", + "additionalProperties": False, + "properties": { + "item": {"type": "string"}, + "quantity": {"type": "string"}, + }, + "required": ["item", "quantity"], + }, + }, + "steps": { + "type": "array", + "minItems": 2, + "items": {"type": "string"}, + }, + "prep_time_minutes": {"type": "integer"}, + }, + "required": ["name", "servings", "ingredients", "steps", "prep_time_minutes"], + }, + }, +} + +RECIPE_TEST_CASE = { + "name": "Nested recipe with arrays of objects", + "response_format": _RECIPE_SCHEMA, + "apply_stage": "always", + "messages": [ + { + "role": "user", + "content": ( + "Give me a simple 4-serving scrambled eggs recipe as structured JSON. " + "Include the recipe name, servings, ingredients (each with item and " + "quantity), preparation steps, and total prep time in minutes." 
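+                # _validate_recipe additionally enforces the schema's
+                # minItems: at least two ingredients and two steps.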
+ ), + } + ], + "validate": lambda parsed, tcs, raw: _validate_recipe(parsed), +} + + +def _validate_recipe(parsed): + required = {"name", "servings", "ingredients", "steps", "prep_time_minutes"} + missing = required - parsed.keys() + if missing: + return False, f"Missing fields: {missing}" + if not isinstance(parsed["name"], str) or not parsed["name"]: + return False, "name must be a non-empty string" + if not isinstance(parsed["servings"], int) or parsed["servings"] <= 0: + return False, f"servings must be positive int: {parsed['servings']!r}" + ings = parsed["ingredients"] + if not isinstance(ings, list) or len(ings) < 2: + return False, f"ingredients must be array of >=2: got {ings!r}" + for i, ing in enumerate(ings): + if not isinstance(ing, dict): + return False, f"ingredient[{i}] is not an object: {ing!r}" + ing_d = cast(dict[str, Any], ing) + item_val = ing_d.get("item") + qty_val = ing_d.get("quantity") + if item_val is None or qty_val is None: + return False, f"ingredient[{i}] missing item/quantity: {ing!r}" + if not isinstance(item_val, str) or not isinstance(qty_val, str): + return False, f"ingredient[{i}] fields must be strings: {ing!r}" + steps = parsed["steps"] + if not isinstance(steps, list) or len(steps) < 2: + return False, f"steps must be array of >=2 strings: got {steps!r}" + if not all(isinstance(s, str) and s for s in steps): + return False, "all steps must be non-empty strings" + pt = parsed["prep_time_minutes"] + if not isinstance(pt, int) or pt <= 0: + return False, f"prep_time_minutes must be positive int: {pt!r}" + return True, f"recipe '{parsed['name']}' with {len(ings)} ingredients, {len(steps)} steps" + + +# ---- Test 4: Tool call -> structured product comparison (after_tools) ---- + +_SHOP_TOOLS = [ + { + "type": "function", + "function": { + "name": "search_products", + "description": "Search a product catalogue by keyword.", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"}, + }, + "required": ["query"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_product_details", + "description": "Get detailed specs for a product by ID.", + "parameters": { + "type": "object", + "properties": { + "product_id": {"type": "string"}, + }, + "required": ["product_id"], + }, + }, + }, +] + +_SHOP_SEARCH_RESULT = { + "results": [ + {"product_id": "LAP-001", "title": "AeroBook 13 Pro", "price": 1399.0, "rating": 4.7}, + {"product_id": "LAP-002", "title": "QuantumSlim 14", "price": 1199.0, "rating": 4.4}, + {"product_id": "LAP-003", "title": "NimbusWork Ultra 15", "price": 999.0, "rating": 4.2}, + ], +} +_SHOP_PRODUCT_DETAILS = { + "LAP-001": { + "product_id": "LAP-001", + "title": "AeroBook 13 Pro", + "cpu": "M-series 10-core", + "ram_gb": 16, + "storage_gb": 512, + "battery_hours": 18, + "weight_kg": 1.24, + "price": 1399.0, + }, + "LAP-002": { + "product_id": "LAP-002", + "title": "QuantumSlim 14", + "cpu": "Core i7 12-core", + "ram_gb": 16, + "storage_gb": 512, + "battery_hours": 12, + "weight_kg": 1.35, + "price": 1199.0, + }, + "LAP-003": { + "product_id": "LAP-003", + "title": "NimbusWork Ultra 15", + "cpu": "Ryzen 7 8-core", + "ram_gb": 16, + "storage_gb": 1024, + "battery_hours": 10, + "weight_kg": 1.70, + "price": 999.0, + }, +} + + +def _shop_details_mock(args): + pid = args.get("product_id", "") + if pid in _SHOP_PRODUCT_DETAILS: + return json.dumps(_SHOP_PRODUCT_DETAILS[pid]) + return json.dumps({"error": f"unknown product_id: {pid}"}) + + +_SHOP_COMPARISON_SCHEMA = { + "type": "json_schema", + 
"json_schema": { + "name": "laptop_comparison", + "strict": True, + "schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "recommendation": {"type": "string"}, + "ranked_candidates": { + "type": "array", + "minItems": 2, + "items": { + "type": "object", + "additionalProperties": False, + "properties": { + "product_id": {"type": "string"}, + "title": {"type": "string"}, + "score": {"type": "number"}, + "reason": {"type": "string"}, + }, + "required": ["product_id", "title", "score", "reason"], + }, + }, + }, + "required": ["recommendation", "ranked_candidates"], + }, + }, +} + +SHOP_COMPARISON_TEST_CASE = { + "name": "Tool calls then structured laptop comparison (after_tools)", + "response_format": _SHOP_COMPARISON_SCHEMA, + "apply_stage": "after_tools", + "tools": _SHOP_TOOLS, + "mock_tool_responses": { + "search_products": lambda _: json.dumps(_SHOP_SEARCH_RESULT), + "get_product_details": _shop_details_mock, + }, + "messages": [ + { + "role": "user", + "content": ( + "I need a lightweight laptop for travel. Please search the catalogue " + "for 'ultraportable laptop', then fetch detailed specs for at least two " + "of the top candidates. Once you've gathered the data I'll ask you to " + "produce a structured comparison." + ), + } + ], + "followup": ( + "Thanks. Now produce the final comparison strictly as JSON matching the " + "laptop_comparison schema: your single best recommendation (the product_id), " + "and a ranked_candidates array of at least two laptops, each with " + "product_id, title, a numeric score, and a short reason." + ), + "validate": lambda parsed, tcs, raw: _validate_shop_comparison(parsed, tcs), +} + + +def _validate_shop_comparison(parsed, tcs): + names = [tc["function"]["name"] for tc in tcs] + if "search_products" not in names: + return False, f"expected search_products tool call, got {names}" + if "get_product_details" not in names: + return False, f"expected get_product_details tool call, got {names}" + if "recommendation" not in parsed or not isinstance(parsed["recommendation"], str): + return False, f"recommendation missing or not a string: {parsed!r}" + cands = parsed.get("ranked_candidates") + if not isinstance(cands, list) or len(cands) < 2: + return False, f"ranked_candidates must be >=2: {cands!r}" + valid_ids = set(_SHOP_PRODUCT_DETAILS.keys()) + candidate_pids: list = [] + for i, c in enumerate(cands): + if not isinstance(c, dict): + return False, f"candidate[{i}] not an object: {c!r}" + c_d = cast(dict[str, Any], c) + pid = c_d.get("product_id") + title = c_d.get("title") + score = c_d.get("score") + reason = c_d.get("reason") + for k, v in (("product_id", pid), ("title", title), + ("score", score), ("reason", reason)): + if v is None: + return False, f"candidate[{i}] missing {k}: {c!r}" + if pid not in valid_ids: + return False, f"candidate[{i}].product_id not in catalogue: {pid!r}" + if not isinstance(score, (int, float)): + return False, f"candidate[{i}].score not numeric: {score!r}" + candidate_pids.append(pid) + recommendation = parsed["recommendation"] + if recommendation not in valid_ids and recommendation not in candidate_pids: + return False, f"recommendation {recommendation!r} not in candidates" + return True, ( + f"tools={names}; recommended={parsed['recommendation']}; " + f"{len(cands)} ranked candidates" + ) + + +# ---- Test 5: Multi-step research then structured report (after_tools) ---- + +_RESEARCH_TOOLS = [ + { + "type": "function", + "function": { + "name": "get_country_stats", + "description": "Fetch basic 
statistics for a country (population, GDP, capital).", + "parameters": { + "type": "object", + "properties": { + "country": {"type": "string"}, + }, + "required": ["country"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_climate_info", + "description": "Fetch climate information for a country.", + "parameters": { + "type": "object", + "properties": { + "country": {"type": "string"}, + }, + "required": ["country"], + }, + }, + }, +] + +_COUNTRY_STATS = { + "norway": { + "country": "Norway", + "capital": "Oslo", + "population": 5_480_000, + "gdp_usd_trillion": 0.48, + "currency": "NOK", + } +} +_CLIMATE_INFO = { + "norway": { + "country": "Norway", + "climate_zone": "subarctic / temperate coastal", + "avg_winter_temp_c": -4.5, + "avg_summer_temp_c": 16.0, + "annual_precipitation_mm": 1400, + } +} + + +def _country_stats_mock(args): + c = args.get("country", "").strip().lower() + if c in _COUNTRY_STATS: + return json.dumps(_COUNTRY_STATS[c]) + return json.dumps({"error": f"unknown country: {c}"}) + + +def _climate_info_mock(args): + c = args.get("country", "").strip().lower() + if c in _CLIMATE_INFO: + return json.dumps(_CLIMATE_INFO[c]) + return json.dumps({"error": f"unknown country: {c}"}) + + +_RESEARCH_REPORT_SCHEMA = { + "type": "json_schema", + "json_schema": { + "name": "country_report", + "strict": True, + "schema": { + "type": "object", + "additionalProperties": False, + "properties": { + "country": {"type": "string"}, + "capital": {"type": "string"}, + "population": {"type": "integer"}, + "climate_summary": {"type": "string"}, + "highlights": { + "type": "array", + "minItems": 2, + "maxItems": 5, + "items": {"type": "string"}, + }, + "suitable_for_tourism": {"type": "boolean"}, + }, + "required": [ + "country", "capital", "population", + "climate_summary", "highlights", "suitable_for_tourism", + ], + }, + }, +} + +COUNTRY_REPORT_TEST_CASE = { + "name": "Research pipeline then structured country report (after_tools)", + "response_format": _RESEARCH_REPORT_SCHEMA, + "apply_stage": "after_tools", + "tools": _RESEARCH_TOOLS, + "mock_tool_responses": { + "get_country_stats": _country_stats_mock, + "get_climate_info": _climate_info_mock, + }, + "messages": [ + { + "role": "user", + "content": ( + "I'm preparing a short briefing on Norway. Please call the " + "get_country_stats and get_climate_info tools to gather data " + "first. Afterwards I'll ask for a structured summary." + ), + } + ], + "followup": ( + "Based on the tool results, produce the briefing as JSON matching the " + "country_report schema. Populate every required field and provide between " + "two and five highlights." 
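+        # A conforming report (sketch): country "Norway", capital "Oslo",
+        # population ~5,480,000, two to five highlight strings, and a
+        # boolean suitable_for_tourism flag.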
+ ), + "validate": lambda parsed, tcs, raw: _validate_country_report(parsed, tcs), +} + + +def _validate_country_report(parsed, tcs): + names = [tc["function"]["name"] for tc in tcs] + for required_tool in ("get_country_stats", "get_climate_info"): + if required_tool not in names: + return False, f"missing tool call {required_tool!r}: got {names}" + required = { + "country", "capital", "population", + "climate_summary", "highlights", "suitable_for_tourism", + } + missing = required - parsed.keys() + if missing: + return False, f"missing report fields: {missing}" + if "norway" not in parsed["country"].lower(): + return False, f"country should reference Norway: {parsed['country']!r}" + if "oslo" not in parsed["capital"].lower(): + return False, f"capital should be Oslo: {parsed['capital']!r}" + if not isinstance(parsed["population"], int) or parsed["population"] < 1_000_000: + return False, f"population implausible: {parsed['population']!r}" + if not isinstance(parsed["climate_summary"], str) or not parsed["climate_summary"]: + return False, "climate_summary must be a non-empty string" + hls = parsed["highlights"] + if not isinstance(hls, list) or not (2 <= len(hls) <= 5): + return False, f"highlights length out of range: {hls!r}" + if not all(isinstance(h, str) and h for h in hls): + return False, "each highlight must be a non-empty string" + if not isinstance(parsed["suitable_for_tourism"], bool): + return False, f"suitable_for_tourism must be bool: {parsed['suitable_for_tourism']!r}" + return True, ( + f"tools={names}; report for {parsed['country']} " + f"(pop {parsed['population']}, {len(hls)} highlights)" + ) + + +# --------------------------------------------------------------------------- +# All test cases +# --------------------------------------------------------------------------- + +ALL_TEST_CASES = [ + BOOK_TEST_CASE, + SENTIMENT_TEST_CASE, + RECIPE_TEST_CASE, + SHOP_COMPARISON_TEST_CASE, + COUNTRY_REPORT_TEST_CASE, +] + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main(): + parser = argparse.ArgumentParser( + description="Test llama-server structured-output capability." 
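+        # Example invocation (flags defined below):
+        #   scripts/server-test-structured.py --no-stream --test recipe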
+ ) + parser.add_argument("--host", default="localhost") + parser.add_argument("--port", default=8080, type=int) + parser.add_argument( + "--no-stream", action="store_true", help="Disable streaming mode tests" + ) + parser.add_argument( + "--stream-only", action="store_true", help="Only run streaming mode tests" + ) + parser.add_argument( + "--test", + help="Run only the test whose name contains this substring (case-insensitive)", + ) + args = parser.parse_args() + + url = f"http://{args.host}:{args.port}/v1/chat/completions" + print_info(f"Testing server at {url}") + + modes: list[bool] = [] + if not args.stream_only: + modes.append(False) + if not args.no_stream: + modes.append(True) + + cases: list[dict] = ALL_TEST_CASES + if args.test: + name_filter = args.test.lower() + cases = [c for c in cases if name_filter in str(c["name"]).lower()] + if not cases: + print_fail(f"No test cases matched '{args.test}'") + sys.exit(1) + + total = 0 + passed = 0 + for stream in modes: + for case in cases: + total += 1 + if run_test(url, case, stream=stream): + passed += 1 + + color = GREEN if passed == total else RED + _print(f"\n{BOLD}{color}{'─' * 60}{RESET}") + _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}") + _print(f"{BOLD}{color}{'─' * 60}{RESET}\n") + sys.exit(0 if passed == total else 1) + + +if __name__ == "__main__": + main() diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 5bdb1e78f..5136e52a7 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -207,6 +207,8 @@ struct cli_context { auto meta = ctx_server.get_meta(); auto & chat_params = meta.chat_params; + auto caps = common_chat_templates_get_caps(chat_params.tmpls.get()); + common_chat_templates_inputs inputs; inputs.messages = common_chat_msgs_parse_oaicompat(messages); inputs.tools = {}; // TODO @@ -214,7 +216,7 @@ struct cli_context { inputs.json_schema = ""; // TODO inputs.grammar = ""; // TODO inputs.use_jinja = chat_params.use_jinja; - inputs.parallel_tool_calls = false; + inputs.parallel_tool_calls = caps["supports_parallel_tool_calls"]; inputs.add_generation_prompt = true; inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; inputs.force_pure_content = chat_params.force_pure_content; diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 18a317e1d..ad8834e31 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1027,6 +1027,8 @@ json oaicompat_chat_params_parse( } } + auto caps = common_chat_templates_get_caps(opt.tmpls.get()); + common_chat_templates_inputs inputs; inputs.messages = common_chat_msgs_parse_oaicompat(messages); inputs.tools = common_chat_tools_parse_oaicompat(tools); @@ -1034,7 +1036,7 @@ json oaicompat_chat_params_parse( inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump(); inputs.grammar = grammar; inputs.use_jinja = opt.use_jinja; - inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); + inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]); inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); inputs.reasoning_format = opt.reasoning_format; if (body.contains("reasoning_format")) {