chat: fix parallel_tool_calls default setting based on model capabilities, add tests for parallel tool calls and structured outputs (#22217)

* chat: fix parallel_tool_calls default setting based on model capabilities, add tests for parallel tool calls and structured outputs

* Fix ty errors.

* Fix flake8 err
Piotr Wilkin (ilintar)
2026-04-22 18:10:56 +02:00
committed by GitHub
parent bcb5eeb645
commit 8bccdbbff9
4 changed files with 1977 additions and 2 deletions
+991
@@ -0,0 +1,991 @@
#!/usr/bin/env python3
"""
Test parallel tool-calling capability via chat completions endpoint.
Only run this against models that actually support parallel tool calls — this
script does not attempt to toggle that setting on the server. Each scenario is
explicitly worded so that a capable model SHOULD emit multiple tool calls in a
single assistant turn (either the same tool N times, or several different
tools at once).
Each test case contains:
- tools: list of tool definitions (OpenAI-compatible)
- messages: initial conversation messages
- mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON)
- expected_parallel: dict describing what constitutes a successful parallel turn
{"min_parallel": int, # minimum tool_calls in one turn
"require_same_tool": Optional[str], # all parallel calls must be this tool
"require_distinct_tools": Optional[int], # >= N distinct tool names in one turn
"min_distinct_args_key": Optional[str]} # parallel calls must span this
# many distinct values of this arg key
- validate: callable(turns, all_tool_calls, final_content) -> (passed, reason)
"""
import argparse
import json
import requests
import sys
# ---------------------------------------------------------------------------
# Color / formatting helpers
# ---------------------------------------------------------------------------
RESET = "\x1b[0m"
BOLD = "\x1b[1m"
DIM = "\x1b[2m"
CYAN = "\x1b[36m"
YELLOW = "\x1b[33m"
GREEN = "\x1b[32m"
RED = "\x1b[31m"
BLUE = "\x1b[34m"
WHITE = "\x1b[97m"
MAGENTA = "\x1b[35m"
def _print(text="", end="\n"):
sys.stdout.write(text + end)
sys.stdout.flush()
def print_header(title):
bar = "" * 60
_print(f"\n{BOLD}{CYAN}{bar}{RESET}")
_print(
f"{BOLD}{CYAN}{WHITE}{title}{CYAN}{' ' * max(0, 58 - len(title))}{RESET}"
)
_print(f"{BOLD}{CYAN}{bar}{RESET}")
def print_turn_banner(turn_idx, n_calls):
color = MAGENTA if n_calls >= 2 else DIM
_print(f"\n {BOLD}{color}▶ turn {turn_idx}{n_calls} tool call(s){RESET}")
def print_tool_call(name, args):
args_str = json.dumps(args)
_print(
f" {BOLD}{YELLOW}{name}{RESET}{DIM}({args_str}){RESET}"
)
def print_tool_result(result):
    preview = result[:140] + ("…" if len(result) > 140 else "")
_print(f" {DIM}{BLUE}{preview}{RESET}")
def print_model_output(text):
sys.stdout.write(text)
sys.stdout.flush()
def print_pass(reason):
_print(f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}")
def print_fail(reason):
_print(f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}")
def print_info(msg):
_print(f"{DIM}{msg}{RESET}")
def print_warn(msg):
_print(f"{BOLD}{YELLOW}{msg}{RESET}")
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
def chat_completion(url, messages, tools=None, stream=False):
payload = {
"messages": messages,
"stream": stream,
"max_tokens": 4096,
}
if tools:
payload["tools"] = tools
payload["tool_choice"] = "auto"
try:
response = requests.post(url, json=payload, stream=stream)
response.raise_for_status()
except requests.exceptions.RequestException as e:
body = e.response.content if (e.response is not None) else b""
print_fail(f"Request error: {e} | body: {body}")
return None
full_content = ""
reasoning_content = ""
tool_calls: list[dict] = []
if stream:
for line in response.iter_lines():
if not line:
continue
decoded = line.decode("utf-8")
if not decoded.startswith("data: "):
continue
data_str = decoded[6:]
if data_str == "[DONE]":
break
try:
data = json.loads(data_str)
except json.JSONDecodeError:
continue
choices = data.get("choices", [])
if not choices:
continue
delta = choices[0].get("delta", {})
if delta.get("reasoning_content"):
reasoning_content += delta["reasoning_content"]
if delta.get("content"):
full_content += delta["content"]
print_model_output(delta["content"])
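            # Streamed tool calls arrive as partial deltas keyed by "index":
            # grow the list as needed, then concatenate the id / name /
            # arguments fragments until the stream finishes.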
for tc in delta.get("tool_calls", []):
idx = tc.get("index", 0)
while len(tool_calls) <= idx:
tool_calls.append(
{
"id": "",
"type": "function",
"function": {"name": "", "arguments": ""},
}
)
if "id" in tc:
tool_calls[idx]["id"] += tc["id"]
if "function" in tc:
if "name" in tc["function"]:
tool_calls[idx]["function"]["name"] += tc["function"]["name"]
if "arguments" in tc["function"]:
tool_calls[idx]["function"]["arguments"] += tc["function"][
"arguments"
]
else:
data = response.json()
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
full_content = msg.get("content") or ""
reasoning_content = msg.get("reasoning_content") or ""
tool_calls = msg.get("tool_calls") or []
if full_content:
print_model_output(full_content)
result = {"content": full_content, "tool_calls": tool_calls}
if reasoning_content:
result["reasoning_content"] = reasoning_content
return result
def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
"""
Drive the multi-turn tool-call loop, but record each turn's tool calls
separately so parallelism can be validated.
Returns (turns, all_tool_calls, final_content) where `turns` is a list
of dicts: {"index": int, "tool_calls": [...], "content": str}.
"""
msgs = list(messages)
turns: list[dict] = []
all_tool_calls: list[dict] = []
for turn_idx in range(max_turns):
result = chat_completion(url, msgs, tools=tools, stream=stream)
if result is None:
return turns, all_tool_calls, None
tcs = result.get("tool_calls") or []
content = result.get("content") or ""
turns.append(
{"index": turn_idx, "tool_calls": list(tcs), "content": content}
)
if not tcs:
if content:
_print(f"\n{DIM}{'·' * 60}{RESET}")
_print(f"{DIM} model response:{RESET}\n")
return turns, all_tool_calls, content
print_turn_banner(turn_idx, len(tcs))
all_tool_calls.extend(tcs)
assistant_msg: dict = {
"role": "assistant",
"content": content,
"tool_calls": tcs,
}
reasoning = result.get("reasoning_content")
if reasoning:
assistant_msg["reasoning_content"] = reasoning
msgs.append(assistant_msg)
for tc in tcs:
tool_name = tc["function"]["name"]
try:
args = json.loads(tc["function"]["arguments"])
except json.JSONDecodeError:
args = {}
print_tool_call(tool_name, args)
mock_fn = mock_tool_responses.get(tool_name)
if mock_fn:
tool_result = mock_fn(args)
else:
tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})
print_tool_result(tool_result)
msgs.append(
{
"role": "tool",
"tool_call_id": tc.get("id", ""),
"content": tool_result,
}
)
return turns, all_tool_calls, None
# ---------------------------------------------------------------------------
# Parallelism helpers
# ---------------------------------------------------------------------------
def _best_parallel_turn(turns):
"""Return the turn (dict) with the most tool calls, or None if no tools."""
tool_turns = [t for t in turns if t["tool_calls"]]
if not tool_turns:
return None
return max(tool_turns, key=lambda t: len(t["tool_calls"]))
def _distinct_tool_names(turn):
return {tc["function"]["name"] for tc in turn["tool_calls"]}
def _distinct_arg_values(turn, key):
values = set()
for tc in turn["tool_calls"]:
try:
args = json.loads(tc["function"]["arguments"])
except json.JSONDecodeError:
continue
v = args.get(key)
if v is not None:
if isinstance(v, str):
values.add(v.strip().lower())
else:
values.add(v)
return values
def _check_parallel(turns, expected):
"""
Check that at least one turn satisfies the parallel-call expectations.
Returns (ok, reason).
"""
best = _best_parallel_turn(turns)
if best is None:
return False, "No tool calls were made at all"
min_parallel = expected.get("min_parallel", 2)
if len(best["tool_calls"]) < min_parallel:
by_turn = [len(t["tool_calls"]) for t in turns]
return False, (
f"No turn had >= {min_parallel} parallel tool calls "
f"(per-turn counts: {by_turn})"
)
require_same = expected.get("require_same_tool")
if require_same is not None:
names = [tc["function"]["name"] for tc in best["tool_calls"]]
if any(n != require_same for n in names):
return False, (
f"Parallel turn mixed tools; expected all {require_same!r}, got {names}"
)
require_distinct = expected.get("require_distinct_tools")
if require_distinct is not None:
distinct = _distinct_tool_names(best)
if len(distinct) < require_distinct:
return False, (
f"Parallel turn had only {len(distinct)} distinct tool names "
f"({distinct}); need >= {require_distinct}"
)
distinct_key = expected.get("min_distinct_args_key")
distinct_count = expected.get("min_distinct_args_count", min_parallel)
if distinct_key is not None:
values = _distinct_arg_values(best, distinct_key)
if len(values) < distinct_count:
return False, (
f"Parallel turn had only {len(values)} distinct {distinct_key!r} "
f"values ({values}); need >= {distinct_count}"
)
return True, (
f"Parallel turn had {len(best['tool_calls'])} calls across "
f"{len(_distinct_tool_names(best))} distinct tool(s)"
)
# ---------------------------------------------------------------------------
# Test case runner
# ---------------------------------------------------------------------------
def run_test(url, test_case, stream):
name = test_case["name"]
mode = f"{'stream' if stream else 'non-stream'}"
print_header(f"{name} [{mode}]")
turns, all_tool_calls, final_content = run_agentic_loop(
url,
messages=test_case["messages"],
tools=test_case["tools"],
mock_tool_responses=test_case["mock_tool_responses"],
stream=stream,
)
if not turns:
print_fail("No response from server.")
return False
parallel_ok, parallel_reason = _check_parallel(turns, test_case["expected_parallel"])
if not parallel_ok:
print_fail(parallel_reason)
return False
passed, reason = test_case["validate"](turns, all_tool_calls, final_content)
if passed:
print_pass(f"{parallel_reason}; {reason}")
else:
print_fail(reason)
return passed
# ---------------------------------------------------------------------------
# Test case definitions
# ---------------------------------------------------------------------------
# ---- Test 1: Multi-file read (same tool, multiple distinct paths) ----
_FILE_TOOLS = [
{
"type": "function",
"function": {
"name": "read_file",
"description": (
"Read the full contents of a file from the local filesystem. "
"Call this tool in parallel when asked to read several files — "
"each path needs its own call."
),
"parameters": {
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Absolute or repo-relative path to a file",
},
},
"required": ["path"],
},
},
},
]
_FILE_CONTENTS = {
"config/database.yml": "host: db.internal\nport: 5432\nuser: svc_app\n",
"config/redis.yml": "host: cache.internal\nport: 6379\ndb: 0\n",
"config/queue.yml": "broker: rabbitmq.internal\nport: 5672\nvhost: prod\n",
"config/auth.yml": "provider: oidc\nissuer: https://auth.internal\n",
}
def _read_file_mock(args):
path = args.get("path", "")
norm = path.lstrip("./").lstrip("/")
content = _FILE_CONTENTS.get(norm)
if content is None:
for k, v in _FILE_CONTENTS.items():
if path.endswith(k):
content = v
break
if content is None:
return json.dumps({"path": path, "error": "not found"})
return json.dumps({"path": path, "content": content})
MULTIFILE_READ_TEST = {
"name": "Parallel multi-file read (same tool, 4 distinct paths)",
"tools": _FILE_TOOLS,
"messages": [
{
"role": "user",
"content": (
"Please read all four of these config files so I can review them "
"together: config/database.yml, config/redis.yml, config/queue.yml, "
"and config/auth.yml. Call read_file for every path in parallel in "
"a single batch — do NOT read them one by one sequentially across "
"turns. After you have all four, give me a one-line summary of each."
),
}
],
"mock_tool_responses": {"read_file": _read_file_mock},
"expected_parallel": {
"min_parallel": 4,
"require_same_tool": "read_file",
"min_distinct_args_key": "path",
"min_distinct_args_count": 4,
},
"validate": lambda turns, tcs, content: _validate_multifile(turns, tcs, content),
}
def _validate_multifile(turns, tcs, content):
del turns
if not content:
return False, "No final summary produced"
return True, f"{len(tcs)} total read_file calls; content length={len(content)}"
# ---- Test 2: Batch TODO marking (same tool, N calls in one turn) ----
_TODO_TOOLS = [
{
"type": "function",
"function": {
"name": "mark_todo_complete",
"description": (
"Mark a single TODO item as complete by ID. When the user wants "
"several items marked at once, call this tool in parallel — "
"one call per item — rather than sequentially across turns."
),
"parameters": {
"type": "object",
"properties": {
"todo_id": {
"type": "string",
"description": "Identifier of the TODO item",
},
"note": {
"type": "string",
"description": "Optional completion note",
},
},
"required": ["todo_id"],
},
},
},
]
_TODO_DB = {
"T-101": "Draft onboarding doc",
"T-102": "Update dependency lockfile",
"T-103": "Fix flaky login test",
"T-104": "Rotate service credentials",
"T-105": "Archive Q4 reports",
}
def _mark_todo_mock(args):
tid = args.get("todo_id", "")
if tid in _TODO_DB:
return json.dumps({"todo_id": tid, "title": _TODO_DB[tid], "status": "done"})
return json.dumps({"todo_id": tid, "error": "unknown id"})
TODO_BATCH_TEST = {
"name": "Batch TODO completion (same tool, 5 IDs in one turn)",
"tools": _TODO_TOOLS,
"messages": [
{
"role": "user",
"content": (
"I finished every item on today's list. Please mark all of the "
"following TODOs as complete, in one parallel batch: T-101, T-102, "
"T-103, T-104, T-105. Don't mark them one at a time across separate "
"turns — issue all five mark_todo_complete calls at once. Afterwards "
"confirm which ones succeeded."
),
}
],
"mock_tool_responses": {"mark_todo_complete": _mark_todo_mock},
"expected_parallel": {
"min_parallel": 5,
"require_same_tool": "mark_todo_complete",
"min_distinct_args_key": "todo_id",
"min_distinct_args_count": 5,
},
"validate": lambda turns, tcs, content: _validate_todo(turns, tcs, content),
}
def _validate_todo(turns, tcs, content):
del turns
if not content:
return False, "No confirmation summary produced"
return True, f"{len(tcs)} total mark_todo_complete calls"
# ---- Test 3: Multi-city weather (same tool, N parallel locations) ----
_WEATHER_TOOLS = [
{
"type": "function",
"function": {
"name": "get_weather",
"description": (
"Fetch current weather for ONE city. When the user asks about "
"several cities, call this tool in parallel — one call per city — "
"instead of sequentially."
),
"parameters": {
"type": "object",
"properties": {
"city": {"type": "string", "description": "City name"},
"units": {
"type": "string",
"enum": ["metric", "imperial"],
"default": "metric",
},
},
"required": ["city"],
},
},
},
]
_WEATHER_DB = {
"tokyo": {"city": "Tokyo", "temp_c": 18.4, "condition": "partly cloudy", "humidity": 64},
"london": {"city": "London", "temp_c": 9.1, "condition": "overcast", "humidity": 81},
"new york": {"city": "New York", "temp_c": 12.7, "condition": "clear", "humidity": 55},
"paris": {"city": "Paris", "temp_c": 11.3, "condition": "light rain", "humidity": 78},
}
def _weather_mock(args):
city = args.get("city", "").strip().lower()
if city.startswith("new york"):
city = "new york"
if city in _WEATHER_DB:
return json.dumps(_WEATHER_DB[city])
return json.dumps({"city": args.get("city", ""), "error": "unknown city"})
MULTI_WEATHER_TEST = {
"name": "Parallel multi-city weather (same tool, 4 cities)",
"tools": _WEATHER_TOOLS,
"messages": [
{
"role": "user",
"content": (
"I'm comparing today's weather across four cities for a travel "
"decision: Tokyo, London, New York, and Paris. Please call "
"get_weather for all four in parallel in a single turn — don't "
"fetch them one at a time. Then rank them from warmest to coolest."
),
}
],
"mock_tool_responses": {"get_weather": _weather_mock},
"expected_parallel": {
"min_parallel": 4,
"require_same_tool": "get_weather",
"min_distinct_args_key": "city",
"min_distinct_args_count": 4,
},
"validate": lambda turns, tcs, content: _validate_weather(turns, tcs, content),
}
def _validate_weather(turns, tcs, content):
del turns
if not content or not any(
kw in content.lower() for kw in ("warmest", "rank", "hot", "cool")
):
return False, f"Final content missing a ranking: {content!r}"
return True, f"{len(tcs)} total get_weather calls; ranking produced"
# ---- Test 4: Trip planning (different tools, parallel in one turn) ----
_TRIP_TOOLS = [
{
"type": "function",
"function": {
"name": "search_flights",
"description": "Search one-way flights between two airports on a given date.",
"parameters": {
"type": "object",
"properties": {
"from_airport": {"type": "string", "description": "IATA code, e.g. SFO"},
"to_airport": {"type": "string", "description": "IATA code, e.g. JFK"},
"date": {"type": "string", "description": "YYYY-MM-DD"},
},
"required": ["from_airport", "to_airport", "date"],
},
},
},
{
"type": "function",
"function": {
"name": "search_hotels",
"description": "Search hotels in a city for a date range.",
"parameters": {
"type": "object",
"properties": {
"city": {"type": "string"},
"check_in": {"type": "string", "description": "YYYY-MM-DD"},
"check_out": {"type": "string", "description": "YYYY-MM-DD"},
"max_price": {"type": "integer"},
},
"required": ["city", "check_in", "check_out"],
},
},
},
{
"type": "function",
"function": {
"name": "search_restaurants",
"description": "Search restaurants in a city by cuisine.",
"parameters": {
"type": "object",
"properties": {
"city": {"type": "string"},
"cuisine": {"type": "string"},
},
"required": ["city"],
},
},
},
]
_FLIGHTS_RESULT = {
"results": [
{"flight": "UA 1552", "depart": "08:15", "arrive": "16:45", "price": 389},
{"flight": "AA 20", "depart": "10:00", "arrive": "18:35", "price": 412},
]
}
_HOTELS_RESULT = {
"results": [
{"name": "Midtown Grand", "nightly_rate": 245, "rating": 4.3},
{"name": "Harbour Boutique", "nightly_rate": 312, "rating": 4.6},
]
}
_RESTAURANTS_RESULT = {
"results": [
{"name": "Trattoria Nona", "cuisine": "italian", "rating": 4.5},
{"name": "Osteria Blu", "cuisine": "italian", "rating": 4.4},
]
}
TRIP_PLAN_TEST = {
"name": "Trip planning (3 different tools in parallel)",
"tools": _TRIP_TOOLS,
"messages": [
{
"role": "user",
"content": (
"I'm flying from SFO to JFK on 2026-06-12 and staying four nights "
"(check out 2026-06-16). I'd also like some Italian restaurant "
"suggestions in New York. Please call search_flights, search_hotels, "
"and search_restaurants in parallel — all three in a single turn, "
"since they don't depend on each other. Then give me a concise "
"travel summary."
),
}
],
"mock_tool_responses": {
"search_flights": lambda _: json.dumps(_FLIGHTS_RESULT),
"search_hotels": lambda _: json.dumps(_HOTELS_RESULT),
"search_restaurants": lambda _: json.dumps(_RESTAURANTS_RESULT),
},
"expected_parallel": {
"min_parallel": 3,
"require_distinct_tools": 3,
},
"validate": lambda turns, tcs, content: _validate_trip(turns, tcs, content),
}
def _validate_trip(turns, tcs, content):
del turns
names = {tc["function"]["name"] for tc in tcs}
required = {"search_flights", "search_hotels", "search_restaurants"}
missing = required - names
if missing:
return False, f"Missing tool calls: {missing}"
if not content:
return False, "No travel summary produced"
return True, f"All three tools called; summary length={len(content)}"
# ---- Test 5: Portfolio check (same tool, parallel tickers) ----
_STOCK_TOOLS = [
{
"type": "function",
"function": {
"name": "get_stock_quote",
"description": (
"Get the latest quote for ONE ticker. When the user asks about "
"multiple tickers, call this tool in parallel — one per symbol — "
"rather than sequentially."
),
"parameters": {
"type": "object",
"properties": {
"symbol": {"type": "string", "description": "Ticker symbol"},
},
"required": ["symbol"],
},
},
},
]
_STOCK_DB = {
"AAPL": {"symbol": "AAPL", "price": 218.45, "change_pct": "+0.8%"},
"MSFT": {"symbol": "MSFT", "price": 421.10, "change_pct": "+1.2%"},
"GOOGL":{"symbol": "GOOGL","price": 175.22, "change_pct": "-0.3%"},
"AMZN": {"symbol": "AMZN", "price": 189.76, "change_pct": "+0.5%"},
"NVDA": {"symbol": "NVDA", "price": 140.88, "change_pct": "+2.4%"},
}
def _stock_mock(args):
sym = args.get("symbol", "").strip().upper()
if sym in _STOCK_DB:
return json.dumps(_STOCK_DB[sym])
return json.dumps({"symbol": sym, "error": "unknown ticker"})
PORTFOLIO_TEST = {
"name": "Portfolio check (same tool, 5 tickers in parallel)",
"tools": _STOCK_TOOLS,
"messages": [
{
"role": "user",
"content": (
"Pull the latest quote for every ticker in my portfolio — AAPL, "
"MSFT, GOOGL, AMZN, and NVDA — in a single parallel batch. These "
"lookups are independent, so please don't chain them across turns. "
"Once you have all five, tell me which ticker had the biggest "
"percentage change today."
),
}
],
"mock_tool_responses": {"get_stock_quote": _stock_mock},
"expected_parallel": {
"min_parallel": 5,
"require_same_tool": "get_stock_quote",
"min_distinct_args_key": "symbol",
"min_distinct_args_count": 5,
},
"validate": lambda turns, tcs, content: _validate_portfolio(turns, tcs, content),
}
def _validate_portfolio(turns, tcs, content):
del turns
if not content or ("nvda" not in content.lower() and "NVDA" not in content):
return False, f"Expected NVDA to be identified as the biggest mover: {content!r}"
return True, f"{len(tcs)} total quotes pulled"
# ---- Test 6: Mixed — translate + dictionary in parallel for the same word ----
_LANG_TOOLS = [
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate a short text into a target language.",
"parameters": {
"type": "object",
"properties": {
"text": {"type": "string"},
"target_language": {"type": "string",
"description": "ISO 639-1 language code, e.g. 'es'"},
},
"required": ["text", "target_language"],
},
},
},
{
"type": "function",
"function": {
"name": "get_definition",
"description": "Get the English dictionary definition of a word.",
"parameters": {
"type": "object",
"properties": {
"word": {"type": "string"},
},
"required": ["word"],
},
},
},
{
"type": "function",
"function": {
"name": "get_synonyms",
"description": "Get English synonyms for a word.",
"parameters": {
"type": "object",
"properties": {
"word": {"type": "string"},
},
"required": ["word"],
},
},
},
]
def _translate_mock(args):
t = args.get("text", "")
lang = args.get("target_language", "")
return json.dumps({"source": t, "target_language": lang, "translation": f"[{lang}] {t}"})
def _definition_mock(args):
w = args.get("word", "")
return json.dumps({
"word": w,
"definition": f"A standard dictionary definition of {w!r}.",
})
def _synonyms_mock(args):
w = args.get("word", "")
return json.dumps({
"word": w,
"synonyms": ["synonym_a", "synonym_b", "synonym_c"],
})
LANG_TOOLKIT_TEST = {
"name": "Language toolkit (translate + definition + synonyms in parallel)",
"tools": _LANG_TOOLS,
"messages": [
{
"role": "user",
"content": (
"For the English word 'resilient', I need three independent "
"look-ups at once: (a) translate it into Spanish, (b) fetch its "
"dictionary definition, and (c) list its synonyms. These three "
"calls don't depend on each other — please issue them in parallel "
"in a single turn. Then present the combined results as a short "
"language note."
),
}
],
"mock_tool_responses": {
"translate_text": _translate_mock,
"get_definition": _definition_mock,
"get_synonyms": _synonyms_mock,
},
"expected_parallel": {
"min_parallel": 3,
"require_distinct_tools": 3,
},
"validate": lambda turns, tcs, content: _validate_lang(turns, tcs, content),
}
def _validate_lang(turns, tcs, content):
del turns
names = {tc["function"]["name"] for tc in tcs}
required = {"translate_text", "get_definition", "get_synonyms"}
missing = required - names
if missing:
return False, f"Missing tool calls: {missing}"
if not content:
return False, "No language note produced"
return True, f"All three lookup tools called; note length={len(content)}"
# ---------------------------------------------------------------------------
# All test cases
# ---------------------------------------------------------------------------
ALL_TEST_CASES = [
MULTIFILE_READ_TEST,
TODO_BATCH_TEST,
MULTI_WEATHER_TEST,
TRIP_PLAN_TEST,
PORTFOLIO_TEST,
LANG_TOOLKIT_TEST,
]
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
description=(
"Test llama-server parallel tool-calling capability. Run this only "
"against models configured for parallel tool calls — this script "
"does not configure that itself."
)
)
parser.add_argument("--host", default="localhost")
parser.add_argument("--port", default=8080, type=int)
parser.add_argument(
"--no-stream", action="store_true", help="Disable streaming mode tests"
)
parser.add_argument(
"--stream-only", action="store_true", help="Only run streaming mode tests"
)
parser.add_argument(
"--test",
help="Run only the test whose name contains this substring (case-insensitive)",
)
args = parser.parse_args()
url = f"http://{args.host}:{args.port}/v1/chat/completions"
print_info(f"Testing server at {url}")
print_warn(
"This script expects the target model to emit multiple tool calls in a "
"single assistant turn. Run it only against parallel-tool-capable models."
)
modes: list[bool] = []
if not args.stream_only:
modes.append(False)
if not args.no_stream:
modes.append(True)
cases: list[dict] = ALL_TEST_CASES
if args.test:
name_filter = args.test.lower()
cases = [c for c in cases if name_filter in str(c["name"]).lower()]
if not cases:
print_fail(f"No test cases matched '{args.test}'")
sys.exit(1)
total = 0
passed = 0
for stream in modes:
for case in cases:
total += 1
if run_test(url, case, stream=stream):
passed += 1
color = GREEN if passed == total else RED
_print(f"\n{BOLD}{color}{'' * 60}{RESET}")
_print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}")
_print(f"{BOLD}{color}{'' * 60}{RESET}\n")
sys.exit(0 if passed == total else 1)
if __name__ == "__main__":
main()
+980
@@ -0,0 +1,980 @@
#!/usr/bin/env python3
"""
Test structured output capability via chat completions endpoint.
Each test case contains:
- response_format: OpenAI-compatible response_format specification
(json_schema only — llama.cpp does not support json_object)
- messages: initial conversation messages
- tools (optional): tool definitions (for mixed tool + structured tests)
- mock_tool_responses (optional): dict mapping tool_name -> callable(arguments) -> str (JSON)
- apply_stage: "always" to apply response_format to every request,
"after_tools" to run the tool loop plain, then request a
structured summary in a follow-up user turn.
- followup (optional, for after_tools): user message appended before the
final structured call.
- validate: callable(parsed_json, tool_calls_history, raw_content) -> (passed: bool, reason: str)
"""
import argparse
import json
import requests
import sys
from typing import Any, cast
# ---------------------------------------------------------------------------
# Color / formatting helpers
# ---------------------------------------------------------------------------
RESET = "\x1b[0m"
BOLD = "\x1b[1m"
DIM = "\x1b[2m"
CYAN = "\x1b[36m"
YELLOW = "\x1b[33m"
GREEN = "\x1b[32m"
RED = "\x1b[31m"
BLUE = "\x1b[34m"
WHITE = "\x1b[97m"
MAGENTA = "\x1b[35m"
def _print(text="", end="\n"):
sys.stdout.write(text + end)
sys.stdout.flush()
def print_header(title):
bar = "" * 60
_print(f"\n{BOLD}{CYAN}{bar}{RESET}")
_print(
f"{BOLD}{CYAN}{WHITE}{title}{CYAN}{' ' * max(0, 58 - len(title))}{RESET}"
)
_print(f"{BOLD}{CYAN}{bar}{RESET}")
def print_tool_call(name, args):
args_str = json.dumps(args)
_print(
f"\n {BOLD}{YELLOW}⚙ tool call{RESET} {CYAN}{name}{RESET}{DIM}({args_str}){RESET}"
)
def print_tool_result(result):
    preview = result[:160] + ("…" if len(result) > 160 else "")
_print(f" {DIM}{BLUE}↳ result{RESET} {DIM}{preview}{RESET}")
def print_model_output(text):
sys.stdout.write(text)
sys.stdout.flush()
def print_pass(reason):
_print(f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}")
def print_fail(reason):
_print(f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}")
def print_info(msg):
_print(f"{DIM}{msg}{RESET}")
def print_schema_note(label, rf):
kind = rf.get("type", "?")
name = ""
if kind == "json_schema":
name = rf.get("json_schema", {}).get("name", "")
_print(f"{DIM}{MAGENTA} ⟐ response_format [{label}]: {kind}"
f"{(' / ' + name) if name else ''}{RESET}")
# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------
def chat_completion(url, messages, tools=None, response_format=None, stream=False):
payload = {
"messages": messages,
"stream": stream,
"max_tokens": 4096,
}
if tools:
payload["tools"] = tools
payload["tool_choice"] = "auto"
if response_format is not None:
payload["response_format"] = response_format
try:
response = requests.post(url, json=payload, stream=stream)
response.raise_for_status()
except requests.exceptions.RequestException as e:
body = e.response.content if (e.response is not None) else b""
print_fail(f"Request error: {e} | body: {body}")
return None
full_content = ""
reasoning_content = ""
tool_calls: list[dict] = []
if stream:
for line in response.iter_lines():
if not line:
continue
decoded = line.decode("utf-8")
if not decoded.startswith("data: "):
continue
data_str = decoded[6:]
if data_str == "[DONE]":
break
try:
data = json.loads(data_str)
except json.JSONDecodeError:
continue
choices = data.get("choices", [])
if not choices:
continue
delta = choices[0].get("delta", {})
if delta.get("reasoning_content"):
reasoning_content += delta["reasoning_content"]
if delta.get("content"):
full_content += delta["content"]
print_model_output(delta["content"])
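            # Same delta-merge logic as the parallel tool-call script:
            # partial tool calls are keyed by "index" and their id / name /
            # arguments fragments are concatenated as they stream in.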
for tc in delta.get("tool_calls", []):
idx = tc.get("index", 0)
while len(tool_calls) <= idx:
tool_calls.append(
{
"id": "",
"type": "function",
"function": {"name": "", "arguments": ""},
}
)
if "id" in tc:
tool_calls[idx]["id"] += tc["id"]
if "function" in tc:
if "name" in tc["function"]:
tool_calls[idx]["function"]["name"] += tc["function"]["name"]
if "arguments" in tc["function"]:
tool_calls[idx]["function"]["arguments"] += tc["function"][
"arguments"
]
else:
data = response.json()
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
full_content = msg.get("content") or ""
reasoning_content = msg.get("reasoning_content") or ""
tool_calls = msg.get("tool_calls") or []
if full_content:
print_model_output(full_content)
result = {"content": full_content, "tool_calls": tool_calls}
if reasoning_content:
result["reasoning_content"] = reasoning_content
return result
def run_tool_loop(
url, messages, tools, mock_tool_responses, stream, response_format=None,
max_turns=6,
):
"""
Drive the tool-call loop. If response_format is provided it is applied to
every request. Returns (all_tool_calls, final_messages, final_content).
"""
msgs = list(messages)
all_tool_calls: list[dict] = []
for _ in range(max_turns):
result = chat_completion(
url, msgs, tools=tools, response_format=response_format, stream=stream
)
if result is None:
return all_tool_calls, msgs, None
tcs = result.get("tool_calls") or []
content = result.get("content") or ""
if not tcs:
if content:
_print(f"\n{DIM}{'·' * 60}{RESET}")
return all_tool_calls, msgs, content
all_tool_calls.extend(tcs)
assistant_msg: dict = {
"role": "assistant",
"content": content,
"tool_calls": tcs,
}
reasoning = result.get("reasoning_content")
if reasoning:
assistant_msg["reasoning_content"] = reasoning
msgs.append(assistant_msg)
for tc in tcs:
tool_name = tc["function"]["name"]
try:
args = json.loads(tc["function"]["arguments"])
except json.JSONDecodeError:
args = {}
print_tool_call(tool_name, args)
mock_fn = mock_tool_responses.get(tool_name) if mock_tool_responses else None
if mock_fn:
tool_result = mock_fn(args)
else:
tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})
print_tool_result(tool_result)
msgs.append(
{
"role": "tool",
"tool_call_id": tc.get("id", ""),
"content": tool_result,
}
)
return all_tool_calls, msgs, None
# ---------------------------------------------------------------------------
# Test case runner
# ---------------------------------------------------------------------------
def _try_parse_json(text):
"""Attempt to parse text as JSON, trimming common markdown fences."""
if text is None:
return None
stripped = text.strip()
if stripped.startswith("```"):
lines = stripped.splitlines()
if lines and lines[0].startswith("```"):
lines = lines[1:]
if lines and lines[-1].strip().startswith("```"):
lines = lines[:-1]
stripped = "\n".join(lines).strip()
try:
return json.loads(stripped)
except json.JSONDecodeError:
return None
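# For example (illustrative): _try_parse_json('```json\n{"a": 1}\n```')
# returns {"a": 1}, while _try_parse_json("not json") returns None.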
def run_test(url, test_case, stream):
name = test_case["name"]
mode = f"{'stream' if stream else 'non-stream'}"
apply_stage = test_case.get("apply_stage", "always")
print_header(f"{name} [{mode}] ({apply_stage})")
response_format = test_case["response_format"]
print_schema_note(apply_stage, response_format)
tools = test_case.get("tools")
mocks = test_case.get("mock_tool_responses") or {}
all_tcs: list[dict] = []
final_content = None
if apply_stage == "always":
all_tcs, _msgs, final_content = run_tool_loop(
url,
messages=list(test_case["messages"]),
tools=tools,
mock_tool_responses=mocks,
stream=stream,
response_format=response_format,
)
elif apply_stage == "after_tools":
# Phase 1: plain tool loop, no response_format applied yet.
all_tcs, msgs, interim_content = run_tool_loop(
url,
messages=list(test_case["messages"]),
tools=tools,
mock_tool_responses=mocks,
stream=stream,
response_format=None,
)
if interim_content:
msgs.append({"role": "assistant", "content": interim_content})
followup = test_case.get(
"followup",
"Now output the answer strictly as JSON matching the provided schema. "
"Do not include commentary.",
)
msgs.append({"role": "user", "content": followup})
# Phase 2: request final structured output. Tools are not passed so the
# model focuses on producing the schema-constrained answer.
_print(f"\n{DIM}{MAGENTA} ⟐ follow-up turn with response_format applied{RESET}")
result = chat_completion(
url, msgs, tools=None, response_format=response_format, stream=stream
)
final_content = result["content"] if result else None
else:
print_fail(f"Unknown apply_stage: {apply_stage}")
return False
if final_content is None:
print_fail("No final content from server.")
return False
parsed = _try_parse_json(final_content)
if parsed is None:
print_fail(f"Final content is not valid JSON: {final_content[:200]!r}")
return False
passed, reason = test_case["validate"](parsed, all_tcs, final_content)
if passed:
print_pass(reason)
else:
print_fail(reason)
return passed
# ---------------------------------------------------------------------------
# Test case definitions
# ---------------------------------------------------------------------------
# ---- Test 1: Book metadata extraction (always / json_schema) ----
_BOOK_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "book_metadata",
"strict": True,
"schema": {
"type": "object",
"additionalProperties": False,
"properties": {
"title": {"type": "string"},
"author": {"type": "string"},
"year": {"type": "integer"},
"genre": {
"type": "string",
"enum": [
"fiction",
"non-fiction",
"fantasy",
"sci-fi",
"mystery",
"biography",
"history",
"other",
],
},
"page_count": {"type": "integer"},
},
"required": ["title", "author", "year", "genre", "page_count"],
},
},
}
BOOK_TEST_CASE = {
"name": "Book metadata extraction (json_schema, always)",
"response_format": _BOOK_SCHEMA,
"apply_stage": "always",
"messages": [
{
"role": "user",
"content": (
"Extract book metadata from this description: "
"'Dune is a 1965 science fiction epic by Frank Herbert, spanning roughly "
"688 pages in its first edition, set on the desert planet Arrakis.' "
"Return the data as JSON."
),
}
],
"validate": lambda parsed, tcs, raw: _validate_book(parsed),
}
def _validate_book(parsed):
required = {"title", "author", "year", "genre", "page_count"}
missing = required - parsed.keys()
if missing:
return False, f"Missing fields: {missing}"
if not isinstance(parsed["title"], str) or not parsed["title"]:
return False, "title must be a non-empty string"
if not isinstance(parsed["author"], str) or "herbert" not in parsed["author"].lower():
return False, f"author unexpected: {parsed['author']!r}"
if not isinstance(parsed["year"], int) or parsed["year"] != 1965:
return False, f"year should be 1965, got {parsed['year']!r}"
if parsed["genre"] not in {
"fiction", "non-fiction", "fantasy", "sci-fi", "mystery",
"biography", "history", "other",
}:
return False, f"genre not in enum: {parsed['genre']!r}"
if not isinstance(parsed["page_count"], int) or parsed["page_count"] <= 0:
return False, f"page_count should be positive int: {parsed['page_count']!r}"
return True, f"Book: {parsed['title']} ({parsed['year']}) / {parsed['genre']}"
# ---- Test 2: Sentiment classification (always / enum-constrained) ----
_SENTIMENT_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "sentiment_analysis",
"strict": True,
"schema": {
"type": "object",
"additionalProperties": False,
"properties": {
"sentiment": {
"type": "string",
"enum": ["positive", "negative", "neutral"],
},
"confidence": {"type": "number"},
"keywords": {
"type": "array",
"items": {"type": "string"},
"minItems": 1,
"maxItems": 5,
},
},
"required": ["sentiment", "confidence", "keywords"],
},
},
}
SENTIMENT_TEST_CASE = {
"name": "Sentiment analysis with enum and array",
"response_format": _SENTIMENT_SCHEMA,
"apply_stage": "always",
"messages": [
{
"role": "user",
"content": (
"Analyse the sentiment of this review and return JSON with the "
"detected sentiment label, a confidence score between 0 and 1, "
"and up to five keyword strings that drove the classification:\n\n"
"'This product completely exceeded my expectations. The build "
"quality is phenomenal, it arrived a day early, and customer "
"support was delightful when I had a setup question.'"
),
}
],
"validate": lambda parsed, tcs, raw: _validate_sentiment(parsed),
}
def _validate_sentiment(parsed):
if parsed.get("sentiment") not in {"positive", "negative", "neutral"}:
return False, f"sentiment not in enum: {parsed.get('sentiment')!r}"
if parsed["sentiment"] != "positive":
return False, f"expected positive sentiment, got {parsed['sentiment']}"
conf = parsed.get("confidence")
if not isinstance(conf, (int, float)) or not (0.0 <= conf <= 1.0):
return False, f"confidence not in [0,1]: {conf!r}"
kws = parsed.get("keywords")
if not isinstance(kws, list) or not (1 <= len(kws) <= 5):
return False, f"keywords length out of range: {kws!r}"
if not all(isinstance(k, str) and k for k in kws):
return False, f"keywords must be non-empty strings: {kws!r}"
return True, f"sentiment={parsed['sentiment']} conf={conf} kws={kws}"
# ---- Test 3: Nested recipe schema (always) ----
_RECIPE_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "recipe",
"strict": True,
"schema": {
"type": "object",
"additionalProperties": False,
"properties": {
"name": {"type": "string"},
"servings": {"type": "integer"},
"ingredients": {
"type": "array",
"minItems": 2,
"items": {
"type": "object",
"additionalProperties": False,
"properties": {
"item": {"type": "string"},
"quantity": {"type": "string"},
},
"required": ["item", "quantity"],
},
},
"steps": {
"type": "array",
"minItems": 2,
"items": {"type": "string"},
},
"prep_time_minutes": {"type": "integer"},
},
"required": ["name", "servings", "ingredients", "steps", "prep_time_minutes"],
},
},
}
RECIPE_TEST_CASE = {
"name": "Nested recipe with arrays of objects",
"response_format": _RECIPE_SCHEMA,
"apply_stage": "always",
"messages": [
{
"role": "user",
"content": (
"Give me a simple 4-serving scrambled eggs recipe as structured JSON. "
"Include the recipe name, servings, ingredients (each with item and "
"quantity), preparation steps, and total prep time in minutes."
),
}
],
"validate": lambda parsed, tcs, raw: _validate_recipe(parsed),
}
def _validate_recipe(parsed):
required = {"name", "servings", "ingredients", "steps", "prep_time_minutes"}
missing = required - parsed.keys()
if missing:
return False, f"Missing fields: {missing}"
if not isinstance(parsed["name"], str) or not parsed["name"]:
return False, "name must be a non-empty string"
if not isinstance(parsed["servings"], int) or parsed["servings"] <= 0:
return False, f"servings must be positive int: {parsed['servings']!r}"
ings = parsed["ingredients"]
if not isinstance(ings, list) or len(ings) < 2:
return False, f"ingredients must be array of >=2: got {ings!r}"
for i, ing in enumerate(ings):
if not isinstance(ing, dict):
return False, f"ingredient[{i}] is not an object: {ing!r}"
ing_d = cast(dict[str, Any], ing)
item_val = ing_d.get("item")
qty_val = ing_d.get("quantity")
if item_val is None or qty_val is None:
return False, f"ingredient[{i}] missing item/quantity: {ing!r}"
if not isinstance(item_val, str) or not isinstance(qty_val, str):
return False, f"ingredient[{i}] fields must be strings: {ing!r}"
steps = parsed["steps"]
if not isinstance(steps, list) or len(steps) < 2:
return False, f"steps must be array of >=2 strings: got {steps!r}"
if not all(isinstance(s, str) and s for s in steps):
return False, "all steps must be non-empty strings"
pt = parsed["prep_time_minutes"]
if not isinstance(pt, int) or pt <= 0:
return False, f"prep_time_minutes must be positive int: {pt!r}"
return True, f"recipe '{parsed['name']}' with {len(ings)} ingredients, {len(steps)} steps"
# ---- Test 4: Tool call -> structured product comparison (after_tools) ----
_SHOP_TOOLS = [
{
"type": "function",
"function": {
"name": "search_products",
"description": "Search a product catalogue by keyword.",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string"},
},
"required": ["query"],
},
},
},
{
"type": "function",
"function": {
"name": "get_product_details",
"description": "Get detailed specs for a product by ID.",
"parameters": {
"type": "object",
"properties": {
"product_id": {"type": "string"},
},
"required": ["product_id"],
},
},
},
]
_SHOP_SEARCH_RESULT = {
"results": [
{"product_id": "LAP-001", "title": "AeroBook 13 Pro", "price": 1399.0, "rating": 4.7},
{"product_id": "LAP-002", "title": "QuantumSlim 14", "price": 1199.0, "rating": 4.4},
{"product_id": "LAP-003", "title": "NimbusWork Ultra 15", "price": 999.0, "rating": 4.2},
],
}
_SHOP_PRODUCT_DETAILS = {
"LAP-001": {
"product_id": "LAP-001",
"title": "AeroBook 13 Pro",
"cpu": "M-series 10-core",
"ram_gb": 16,
"storage_gb": 512,
"battery_hours": 18,
"weight_kg": 1.24,
"price": 1399.0,
},
"LAP-002": {
"product_id": "LAP-002",
"title": "QuantumSlim 14",
"cpu": "Core i7 12-core",
"ram_gb": 16,
"storage_gb": 512,
"battery_hours": 12,
"weight_kg": 1.35,
"price": 1199.0,
},
"LAP-003": {
"product_id": "LAP-003",
"title": "NimbusWork Ultra 15",
"cpu": "Ryzen 7 8-core",
"ram_gb": 16,
"storage_gb": 1024,
"battery_hours": 10,
"weight_kg": 1.70,
"price": 999.0,
},
}
def _shop_details_mock(args):
pid = args.get("product_id", "")
if pid in _SHOP_PRODUCT_DETAILS:
return json.dumps(_SHOP_PRODUCT_DETAILS[pid])
return json.dumps({"error": f"unknown product_id: {pid}"})
_SHOP_COMPARISON_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "laptop_comparison",
"strict": True,
"schema": {
"type": "object",
"additionalProperties": False,
"properties": {
"recommendation": {"type": "string"},
"ranked_candidates": {
"type": "array",
"minItems": 2,
"items": {
"type": "object",
"additionalProperties": False,
"properties": {
"product_id": {"type": "string"},
"title": {"type": "string"},
"score": {"type": "number"},
"reason": {"type": "string"},
},
"required": ["product_id", "title", "score", "reason"],
},
},
},
"required": ["recommendation", "ranked_candidates"],
},
},
}
SHOP_COMPARISON_TEST_CASE = {
"name": "Tool calls then structured laptop comparison (after_tools)",
"response_format": _SHOP_COMPARISON_SCHEMA,
"apply_stage": "after_tools",
"tools": _SHOP_TOOLS,
"mock_tool_responses": {
"search_products": lambda _: json.dumps(_SHOP_SEARCH_RESULT),
"get_product_details": _shop_details_mock,
},
"messages": [
{
"role": "user",
"content": (
"I need a lightweight laptop for travel. Please search the catalogue "
"for 'ultraportable laptop', then fetch detailed specs for at least two "
"of the top candidates. Once you've gathered the data I'll ask you to "
"produce a structured comparison."
),
}
],
"followup": (
"Thanks. Now produce the final comparison strictly as JSON matching the "
"laptop_comparison schema: your single best recommendation (the product_id), "
"and a ranked_candidates array of at least two laptops, each with "
"product_id, title, a numeric score, and a short reason."
),
"validate": lambda parsed, tcs, raw: _validate_shop_comparison(parsed, tcs),
}
def _validate_shop_comparison(parsed, tcs):
names = [tc["function"]["name"] for tc in tcs]
if "search_products" not in names:
return False, f"expected search_products tool call, got {names}"
if "get_product_details" not in names:
return False, f"expected get_product_details tool call, got {names}"
if "recommendation" not in parsed or not isinstance(parsed["recommendation"], str):
return False, f"recommendation missing or not a string: {parsed!r}"
cands = parsed.get("ranked_candidates")
if not isinstance(cands, list) or len(cands) < 2:
return False, f"ranked_candidates must be >=2: {cands!r}"
valid_ids = set(_SHOP_PRODUCT_DETAILS.keys())
candidate_pids: list = []
for i, c in enumerate(cands):
if not isinstance(c, dict):
return False, f"candidate[{i}] not an object: {c!r}"
c_d = cast(dict[str, Any], c)
pid = c_d.get("product_id")
title = c_d.get("title")
score = c_d.get("score")
reason = c_d.get("reason")
for k, v in (("product_id", pid), ("title", title),
("score", score), ("reason", reason)):
if v is None:
return False, f"candidate[{i}] missing {k}: {c!r}"
if pid not in valid_ids:
return False, f"candidate[{i}].product_id not in catalogue: {pid!r}"
if not isinstance(score, (int, float)):
return False, f"candidate[{i}].score not numeric: {score!r}"
candidate_pids.append(pid)
recommendation = parsed["recommendation"]
if recommendation not in valid_ids and recommendation not in candidate_pids:
return False, f"recommendation {recommendation!r} not in candidates"
return True, (
f"tools={names}; recommended={parsed['recommendation']}; "
f"{len(cands)} ranked candidates"
)
# ---- Test 5: Multi-step research then structured report (after_tools) ----
_RESEARCH_TOOLS = [
{
"type": "function",
"function": {
"name": "get_country_stats",
"description": "Fetch basic statistics for a country (population, GDP, capital).",
"parameters": {
"type": "object",
"properties": {
"country": {"type": "string"},
},
"required": ["country"],
},
},
},
{
"type": "function",
"function": {
"name": "get_climate_info",
"description": "Fetch climate information for a country.",
"parameters": {
"type": "object",
"properties": {
"country": {"type": "string"},
},
"required": ["country"],
},
},
},
]
_COUNTRY_STATS = {
"norway": {
"country": "Norway",
"capital": "Oslo",
"population": 5_480_000,
"gdp_usd_trillion": 0.48,
"currency": "NOK",
}
}
_CLIMATE_INFO = {
"norway": {
"country": "Norway",
"climate_zone": "subarctic / temperate coastal",
"avg_winter_temp_c": -4.5,
"avg_summer_temp_c": 16.0,
"annual_precipitation_mm": 1400,
}
}
def _country_stats_mock(args):
c = args.get("country", "").strip().lower()
if c in _COUNTRY_STATS:
return json.dumps(_COUNTRY_STATS[c])
return json.dumps({"error": f"unknown country: {c}"})
def _climate_info_mock(args):
c = args.get("country", "").strip().lower()
if c in _CLIMATE_INFO:
return json.dumps(_CLIMATE_INFO[c])
return json.dumps({"error": f"unknown country: {c}"})
_RESEARCH_REPORT_SCHEMA = {
"type": "json_schema",
"json_schema": {
"name": "country_report",
"strict": True,
"schema": {
"type": "object",
"additionalProperties": False,
"properties": {
"country": {"type": "string"},
"capital": {"type": "string"},
"population": {"type": "integer"},
"climate_summary": {"type": "string"},
"highlights": {
"type": "array",
"minItems": 2,
"maxItems": 5,
"items": {"type": "string"},
},
"suitable_for_tourism": {"type": "boolean"},
},
"required": [
"country", "capital", "population",
"climate_summary", "highlights", "suitable_for_tourism",
],
},
},
}
COUNTRY_REPORT_TEST_CASE = {
"name": "Research pipeline then structured country report (after_tools)",
"response_format": _RESEARCH_REPORT_SCHEMA,
"apply_stage": "after_tools",
"tools": _RESEARCH_TOOLS,
"mock_tool_responses": {
"get_country_stats": _country_stats_mock,
"get_climate_info": _climate_info_mock,
},
"messages": [
{
"role": "user",
"content": (
"I'm preparing a short briefing on Norway. Please call the "
"get_country_stats and get_climate_info tools to gather data "
"first. Afterwards I'll ask for a structured summary."
),
}
],
"followup": (
"Based on the tool results, produce the briefing as JSON matching the "
"country_report schema. Populate every required field and provide between "
"two and five highlights."
),
"validate": lambda parsed, tcs, raw: _validate_country_report(parsed, tcs),
}
def _validate_country_report(parsed, tcs):
names = [tc["function"]["name"] for tc in tcs]
for required_tool in ("get_country_stats", "get_climate_info"):
if required_tool not in names:
return False, f"missing tool call {required_tool!r}: got {names}"
required = {
"country", "capital", "population",
"climate_summary", "highlights", "suitable_for_tourism",
}
missing = required - parsed.keys()
if missing:
return False, f"missing report fields: {missing}"
if "norway" not in parsed["country"].lower():
return False, f"country should reference Norway: {parsed['country']!r}"
if "oslo" not in parsed["capital"].lower():
return False, f"capital should be Oslo: {parsed['capital']!r}"
if not isinstance(parsed["population"], int) or parsed["population"] < 1_000_000:
return False, f"population implausible: {parsed['population']!r}"
if not isinstance(parsed["climate_summary"], str) or not parsed["climate_summary"]:
return False, "climate_summary must be a non-empty string"
hls = parsed["highlights"]
if not isinstance(hls, list) or not (2 <= len(hls) <= 5):
return False, f"highlights length out of range: {hls!r}"
if not all(isinstance(h, str) and h for h in hls):
return False, "each highlight must be a non-empty string"
if not isinstance(parsed["suitable_for_tourism"], bool):
return False, f"suitable_for_tourism must be bool: {parsed['suitable_for_tourism']!r}"
return True, (
f"tools={names}; report for {parsed['country']} "
f"(pop {parsed['population']}, {len(hls)} highlights)"
)
# ---------------------------------------------------------------------------
# All test cases
# ---------------------------------------------------------------------------
ALL_TEST_CASES = [
BOOK_TEST_CASE,
SENTIMENT_TEST_CASE,
RECIPE_TEST_CASE,
SHOP_COMPARISON_TEST_CASE,
COUNTRY_REPORT_TEST_CASE,
]
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
description="Test llama-server structured-output capability."
)
parser.add_argument("--host", default="localhost")
parser.add_argument("--port", default=8080, type=int)
parser.add_argument(
"--no-stream", action="store_true", help="Disable streaming mode tests"
)
parser.add_argument(
"--stream-only", action="store_true", help="Only run streaming mode tests"
)
parser.add_argument(
"--test",
help="Run only the test whose name contains this substring (case-insensitive)",
)
args = parser.parse_args()
url = f"http://{args.host}:{args.port}/v1/chat/completions"
print_info(f"Testing server at {url}")
modes: list[bool] = []
if not args.stream_only:
modes.append(False)
if not args.no_stream:
modes.append(True)
cases: list[dict] = ALL_TEST_CASES
if args.test:
name_filter = args.test.lower()
cases = [c for c in cases if name_filter in str(c["name"]).lower()]
if not cases:
print_fail(f"No test cases matched '{args.test}'")
sys.exit(1)
total = 0
passed = 0
for stream in modes:
for case in cases:
total += 1
if run_test(url, case, stream=stream):
passed += 1
color = GREEN if passed == total else RED
_print(f"\n{BOLD}{color}{'' * 60}{RESET}")
_print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}")
_print(f"{BOLD}{color}{'' * 60}{RESET}\n")
sys.exit(0 if passed == total else 1)
if __name__ == "__main__":
main()
+3 -1
@@ -207,6 +207,8 @@ struct cli_context {
     auto meta = ctx_server.get_meta();
     auto & chat_params = meta.chat_params;
 
+    auto caps = common_chat_templates_get_caps(chat_params.tmpls.get());
+
     common_chat_templates_inputs inputs;
     inputs.messages = common_chat_msgs_parse_oaicompat(messages);
     inputs.tools = {}; // TODO
@@ -214,7 +216,7 @@
     inputs.json_schema = ""; // TODO
     inputs.grammar = ""; // TODO
     inputs.use_jinja = chat_params.use_jinja;
-    inputs.parallel_tool_calls = false;
+    inputs.parallel_tool_calls = caps["supports_parallel_tool_calls"];
     inputs.add_generation_prompt = true;
     inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     inputs.force_pure_content = chat_params.force_pure_content;
+3 -1
@@ -1027,6 +1027,8 @@ json oaicompat_chat_params_parse(
         }
     }
 
+    auto caps = common_chat_templates_get_caps(opt.tmpls.get());
+
     common_chat_templates_inputs inputs;
     inputs.messages = common_chat_msgs_parse_oaicompat(messages);
     inputs.tools = common_chat_tools_parse_oaicompat(tools);
@@ -1034,7 +1036,7 @@
     inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
     inputs.grammar = grammar;
     inputs.use_jinja = opt.use_jinja;
-    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
+    inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
     inputs.reasoning_format = opt.reasoning_format;
     if (body.contains("reasoning_format")) {
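
With this change, a request that omits parallel_tool_calls inherits the chat template's capability instead of always defaulting to false, while an explicit value in the request body still wins. A minimal client-side sketch of the new behavior (hypothetical server setup; endpoint as used by the test scripts above):

import requests

URL = "http://localhost:8080/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "Weather in Tokyo and Paris?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
    # No "parallel_tool_calls" key: the server now fills it from the template
    # caps ("supports_parallel_tool_calls") instead of hard-coding false.
}
r = requests.post(URL, json=payload)

# An explicit value still overrides the capability-based default:
payload["parallel_tool_calls"] = False
r = requests.post(URL, json=payload)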