From 115504a7e5d19af315164606841dc5aecb6d7a73 Mon Sep 17 00:00:00 2001 From: Carson Date: Tue, 6 Jan 2026 11:33:57 -0600 Subject: [PATCH 1/5] feat: add data_model parameter to to_solver() for structured output in evals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a data_model (Pydantic model) is provided to .to_solver(), the solver uses .chat_structured_async() instead of .chat_async() to generate responses. The resulting Pydantic model instance is serialized to JSON and set as the completion text in state.output.completion. This allows using chatlas for structured data extraction tasks in Inspect AI evaluations, where scorers can parse and validate the JSON output. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- CHANGELOG.md | 1 + chatlas/_chat.py | 27 +- ...ectIntegration.test_structured_output.yaml | 259 ++++++++++++++++++ tests/test_inspect.py | 40 +++ 4 files changed, 324 insertions(+), 3 deletions(-) create mode 100644 tests/_vcr/test_inspect/TestInspectIntegration.test_structured_output.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ef1011e..52930a62 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### New features * `.stream()` and `.stream_async()` now support a `data_model` parameter for structured data extraction while streaming. (#262) +* `.to_solver()` now supports a `data_model` parameter for structured data extraction in evals. When provided, the solver uses `.chat_structured()` instead of `.chat()` and outputs JSON-serialized data. ## [0.15.0] - 2026-01-06 diff --git a/chatlas/_chat.py b/chatlas/_chat.py index a0ee8f98..5288101e 100644 --- a/chatlas/_chat.py +++ b/chatlas/_chat.py @@ -834,6 +834,7 @@ def console( def to_solver( self, *, + data_model: type[BaseModel] | None = None, include_system_prompt: bool = False, include_turns: bool = False, ): @@ -847,6 +848,11 @@ def to_solver( Parameters ---------- + data_model + A Pydantic model describing the structure of the data to extract. + When provided, the solver will use `.chat_structured()` instead of + `.chat()` to generate responses, and the output completion will be + JSON serialized from the model instance. include_system_prompt Whether to include the system prompt in the solver's starting messages. @@ -977,8 +983,19 @@ async def solve(state: InspectTaskState, generate): input_content = [input_content] input_content = [inspect_content_as_chatlas(x) for x in input_content] - # Generate the response (this can generate multiple turns!) - await chat_instance.chat_async(*input_content, echo="none") + # Generate the response + # When data_model is provided, use chat_structured_async() for + # structured output; otherwise use chat_async() which can handle + # tool calling loops. + if data_model is not None: + result = await chat_instance.chat_structured_async( + *input_content, data_model=data_model, echo="none" + ) + completion_text = result.model_dump_json() + else: + # This can generate multiple turns via tool calling + await chat_instance.chat_async(*input_content, echo="none") + completion_text = None # Will be set from turns[-1].text # Map change in chatlas Turn state back to Inspect message.state # (Note: we skip the user prompt turn since it's already included) @@ -1001,10 +1018,14 @@ async def solve(state: InspectTaskState, generate): "Expected the last message in InspectAI state to be an assistant message" ) + # Use the structured JSON output if available, otherwise the text + if completion_text is None: + completion_text = turns[-1].text + state.output = imodel.ModelOutput( model=model, choices=[imodel.ChatCompletionChoice(message=last_message)], - completion=turns[-1].text, + completion=completion_text, usage=usage, time=time.perf_counter() - start_time, ) diff --git a/tests/_vcr/test_inspect/TestInspectIntegration.test_structured_output.yaml b/tests/_vcr/test_inspect/TestInspectIntegration.test_structured_output.yaml new file mode 100644 index 00000000..b6d791d6 --- /dev/null +++ b/tests/_vcr/test_inspect/TestInspectIntegration.test_structured_output.yaml @@ -0,0 +1,259 @@ +interactions: +- request: + body: '{"max_tokens": 4096, "messages": [{"role": "user", "content": [{"text": + "John Smith is 42 years old.", "type": "text", "cache_control": {"type": "ephemeral", + "ttl": "5m"}}]}], "model": "claude-haiku-4-5-20251001", "stream": false, "system": + [{"type": "text", "text": "Extract person information from the text. Return + only the structured data.", "cache_control": {"type": "ephemeral", "ttl": "5m"}}], + "tool_choice": {"type": "tool", "name": "_structured_tool_call"}, "tools": [{"name": + "_structured_tool_call", "input_schema": {"type": "object", "properties": {"data": + {"properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, "required": + ["name", "age"], "type": "object", "additionalProperties": false}}}, "description": + "Extract structured data"}]}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '708' + Content-Type: + - application/json + Host: + - api.anthropic.com + X-Stainless-Async: + - async:asyncio + anthropic-version: + - '2023-06-01' + x-stainless-read-timeout: + - '600' + x-stainless-timeout: + - '600' + method: POST + uri: https://api.anthropic.com/v1/messages + response: + body: + string: '{"model":"claude-haiku-4-5-20251001","id":"msg_011ppdxvaxeQ3E5pouB6zmeT","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01Ji9YiEGtUtDtY3ezUbierv","name":"_structured_tool_call","input":{"data":{"name":"John + Smith","age":42}}}],"stop_reason":"tool_use","stop_sequence":null,"usage":{"input_tokens":711,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":44,"service_tier":"standard"}}' + headers: + CF-RAY: + - 9b9cf181cccfe75f-DEN + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Tue, 06 Jan 2026 17:30:28 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Robots-Tag: + - none + anthropic-ratelimit-input-tokens-limit: + - '4000000' + anthropic-ratelimit-input-tokens-remaining: + - '4000000' + anthropic-ratelimit-input-tokens-reset: + - '2026-01-06T17:30:27Z' + anthropic-ratelimit-output-tokens-limit: + - '800000' + anthropic-ratelimit-output-tokens-remaining: + - '800000' + anthropic-ratelimit-output-tokens-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-requests-limit: + - '4000' + anthropic-ratelimit-requests-remaining: + - '3999' + anthropic-ratelimit-requests-reset: + - '2026-01-06T17:30:27Z' + anthropic-ratelimit-tokens-limit: + - '4800000' + anthropic-ratelimit-tokens-remaining: + - '4800000' + anthropic-ratelimit-tokens-reset: + - '2026-01-06T17:30:27Z' + cf-cache-status: + - DYNAMIC + content-length: + - '517' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-envoy-upstream-service-time: + - '908' + status: + code: 200 + message: OK +- request: + body: '{"max_tokens": 32000, "messages": [{"role": "user", "content": "\nYou are + assessing a submitted answer on a given task based on a criterion. Here is the + data:\n\n[BEGIN DATA]\n***\n[Task]: John Smith is 42 years old.\n***\n[Submission]: + {\"name\":\"John Smith\",\"age\":42}\n***\n[Criterion]: {\"name\": \"John Smith\", + \"age\": 42}\n***\n[END DATA]\n\nDoes the submission meet the criterion?\n\n\nAfter + assessing the submitted answer, reply with ''GRADE: $LETTER'' (without quotes) + where LETTER is one of CI. Please choose ONE option for the grade: either \"C\" + for correct answers, or \"I\" for incorrect answers.\n\nFor example, after reviewing + a correct answer you might write ''GRADE: C'' or after reviewing an incorrect + answer you might write ''GRADE: I''.\n\nFirst, write out in a step by step manner + your reasoning about the criterion to be sure that your conclusion is correct. + Avoid simply stating the correct answers at the outset. Then, end with your + answer formatted as ''GRADE: $LETTER'' (without quotes) where LETTER is one + of CI.\n\n"}], "model": "claude-haiku-4-5-20251001", "tools": [], "stream": + true}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '1101' + Content-Type: + - application/json + Host: + - api.anthropic.com + X-Stainless-Async: + - async:asyncio + X-Stainless-Helper-Method: + - stream + X-Stainless-Stream-Helper: + - messages + anthropic-version: + - '2023-06-01' + x-irid: + - fGWumscTVoUzHYbrDFdkF6 + x-stainless-read-timeout: + - '600' + x-stainless-timeout: + - NOT_GIVEN + method: POST + uri: https://api.anthropic.com/v1/messages + response: + body: + string: "event: message_start\ndata: {\"type\":\"message_start\",\"message\":{\"model\":\"claude-haiku-4-5-20251001\",\"id\":\"msg_01Ez5kNaynHQCNnUh1S2WnBL\",\"type\":\"message\",\"role\":\"assistant\",\"content\":[],\"stop_reason\":null,\"stop_sequence\":null,\"usage\":{\"input_tokens\":259,\"cache_creation_input_tokens\":0,\"cache_read_input_tokens\":0,\"cache_creation\":{\"ephemeral_5m_input_tokens\":0,\"ephemeral_1h_input_tokens\":0},\"output_tokens\":1,\"service_tier\":\"standard\"}}}\n\nevent: + content_block_start\ndata: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"text\",\"text\":\"\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"Let\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + me assess\"} }\n\nevent: ping\ndata: {\"type\": \"ping\"}\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + this step\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + by step.\\n\\n**\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"Comparing\"} + }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + the\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + Submission to the Criterion:**\\n\\nThe\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + criterion\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + spec\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"ifies:\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n- + name\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\": + \\\"John Smith\\\"\\n- age\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\": + 42\\n\\nThe submission provides:\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n- + name: \\\"John Smith\\\"\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n- + age: 42\\n\\n**\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"Checking + each\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + fiel\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d:**\\n\\n1. + **Name fiel\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d**: + The\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + submission\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + has \\\"John Smith\\\" an\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d + the criterion requires\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + \\\"John Smith\\\"\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\u2713\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + Match\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n2. + **Age field**: The submission\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + has \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"42 + and the criterion requires 42\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + \u2713 Match\\n3\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\". + **Format\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"**: + Both\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + are in JSON\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + object\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + format with\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + the\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + same structure \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\u2713 + Match\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n4. + **Data\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + types\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"**: + Both fields\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + match\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + in\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + type (\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"string + for\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + name, number for age) \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\u2713 + Match\\n\\n**Conclusion:**\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n\\nThe\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + submission exactly\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + matches all\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + requirements\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + specifie\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d + in the criterion.\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + Both\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + the\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + fiel\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d + names and their\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + corresponding\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + values are identical.\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n\\nGRADE: + C\"} }\n\nevent: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0 + \ }\n\nevent: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\",\"stop_sequence\":null},\"usage\":{\"input_tokens\":259,\"cache_creation_input_tokens\":0,\"cache_read_input_tokens\":0,\"output_tokens\":201} + \ }\n\nevent: message_stop\ndata: {\"type\":\"message_stop\" }\n\n" + headers: + CF-RAY: + - 9b9cf189adb07c34-DEN + Cache-Control: + - no-cache + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Tue, 06 Jan 2026 17:30:29 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Robots-Tag: + - none + anthropic-ratelimit-input-tokens-limit: + - '4000000' + anthropic-ratelimit-input-tokens-remaining: + - '4000000' + anthropic-ratelimit-input-tokens-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-output-tokens-limit: + - '800000' + anthropic-ratelimit-output-tokens-remaining: + - '800000' + anthropic-ratelimit-output-tokens-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-requests-limit: + - '4000' + anthropic-ratelimit-requests-remaining: + - '3999' + anthropic-ratelimit-requests-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-tokens-limit: + - '4800000' + anthropic-ratelimit-tokens-remaining: + - '4800000' + anthropic-ratelimit-tokens-reset: + - '2026-01-06T17:30:28Z' + cf-cache-status: + - DYNAMIC + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-envoy-upstream-service-time: + - '290' + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_inspect.py b/tests/test_inspect.py index 95fc6e84..c2c980fb 100644 --- a/tests/test_inspect.py +++ b/tests/test_inspect.py @@ -1,8 +1,10 @@ import datetime +import json import sys from unittest.mock import patch import pytest +from pydantic import BaseModel from chatlas import AssistantTurn, Chat, ChatAnthropic, ContentToolRequest, Turn, UserTurn from chatlas._content import ( @@ -349,6 +351,44 @@ def get_current_date(): accuracy = results.scores[0].metrics["accuracy"].value assert accuracy == 1, f"Expected accuracy of 1, but got {accuracy}" + @pytest.mark.vcr + def test_structured_output(self): + """Test that to_solver() works with data_model for structured output.""" + + class Person(BaseModel): + name: str + age: int + + chat = chat_func( + system_prompt="Extract person information from the text. Return only the structured data." + ) + + # Create task with data_model parameter + task = Task( + dataset=[ + Sample( + input="John Smith is 42 years old.", + target='{"name": "John Smith", "age": 42}', + ) + ], + solver=chat.to_solver(data_model=Person), + scorer=model_graded_qa(model=SCORER_MODEL), + ) + + log = inspect_eval(task)[0] + results = log.results + + assert results is not None + assert log.samples is not None + + # Verify the output is valid JSON matching our model + sample = log.samples[0] + assert sample.output.completion is not None + completion = sample.output.completion + parsed = json.loads(completion) + assert parsed["name"] == "John Smith" + assert parsed["age"] == 42 + # Skip VCR for multi-sample tests - response ordering with VCR is unreliable # when body matching is disabled (required due to dynamic IDs in requests) @pytest.mark.skipif( From d10c138749db7376ad1baab619567a41a0281612 Mon Sep 17 00:00:00 2001 From: Carson Date: Tue, 6 Jan 2026 11:34:49 -0600 Subject: [PATCH 2/5] docs: add PR number to CHANGELOG entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52930a62..f8f7217a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### New features * `.stream()` and `.stream_async()` now support a `data_model` parameter for structured data extraction while streaming. (#262) -* `.to_solver()` now supports a `data_model` parameter for structured data extraction in evals. When provided, the solver uses `.chat_structured()` instead of `.chat()` and outputs JSON-serialized data. +* `.to_solver()` now supports a `data_model` parameter for structured data extraction in evals. When provided, the solver uses `.chat_structured()` instead of `.chat()` and outputs JSON-serialized data. (#264) ## [0.15.0] - 2026-01-06 From 5040ec9e08bb40984521aebe35541233ff509032 Mon Sep 17 00:00:00 2001 From: Carson Date: Tue, 6 Jan 2026 11:41:58 -0600 Subject: [PATCH 3/5] refactor: consolidate completion logic in to_solver() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move all completion text determination to one place, right before setting state.output. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- chatlas/_chat.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/chatlas/_chat.py b/chatlas/_chat.py index 5288101e..10e61a87 100644 --- a/chatlas/_chat.py +++ b/chatlas/_chat.py @@ -987,15 +987,13 @@ async def solve(state: InspectTaskState, generate): # When data_model is provided, use chat_structured_async() for # structured output; otherwise use chat_async() which can handle # tool calling loops. + structured_result: BaseModel | None = None if data_model is not None: - result = await chat_instance.chat_structured_async( + structured_result = await chat_instance.chat_structured_async( *input_content, data_model=data_model, echo="none" ) - completion_text = result.model_dump_json() else: - # This can generate multiple turns via tool calling await chat_instance.chat_async(*input_content, echo="none") - completion_text = None # Will be set from turns[-1].text # Map change in chatlas Turn state back to Inspect message.state # (Note: we skip the user prompt turn since it's already included) @@ -1018,14 +1016,17 @@ async def solve(state: InspectTaskState, generate): "Expected the last message in InspectAI state to be an assistant message" ) - # Use the structured JSON output if available, otherwise the text - if completion_text is None: - completion_text = turns[-1].text + # Determine completion text: use structured JSON if available, + # otherwise use the text from the last turn + if structured_result is not None: + completion = structured_result.model_dump_json() + else: + completion = turns[-1].text state.output = imodel.ModelOutput( model=model, choices=[imodel.ChatCompletionChoice(message=last_message)], - completion=completion_text, + completion=completion, usage=usage, time=time.perf_counter() - start_time, ) From 325d4d99f91db0387bc33e9e5c552d4142542411 Mon Sep 17 00:00:00 2001 From: Carson Sievert Date: Tue, 6 Jan 2026 11:43:36 -0600 Subject: [PATCH 4/5] Apply suggestions from code review --- chatlas/_chat.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/chatlas/_chat.py b/chatlas/_chat.py index 10e61a87..6ab73c0a 100644 --- a/chatlas/_chat.py +++ b/chatlas/_chat.py @@ -984,9 +984,6 @@ async def solve(state: InspectTaskState, generate): input_content = [inspect_content_as_chatlas(x) for x in input_content] # Generate the response - # When data_model is provided, use chat_structured_async() for - # structured output; otherwise use chat_async() which can handle - # tool calling loops. structured_result: BaseModel | None = None if data_model is not None: structured_result = await chat_instance.chat_structured_async( @@ -1016,8 +1013,6 @@ async def solve(state: InspectTaskState, generate): "Expected the last message in InspectAI state to be an assistant message" ) - # Determine completion text: use structured JSON if available, - # otherwise use the text from the last turn if structured_result is not None: completion = structured_result.model_dump_json() else: From e027701515f58ee23695dc71286b5e8c833ed470 Mon Sep 17 00:00:00 2001 From: Carson Sievert Date: Tue, 6 Jan 2026 11:44:41 -0600 Subject: [PATCH 5/5] Update chatlas/_chat.py --- chatlas/_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chatlas/_chat.py b/chatlas/_chat.py index 6ab73c0a..709f5bc0 100644 --- a/chatlas/_chat.py +++ b/chatlas/_chat.py @@ -834,9 +834,9 @@ def console( def to_solver( self, *, - data_model: type[BaseModel] | None = None, include_system_prompt: bool = False, include_turns: bool = False, + data_model: type[BaseModel] | None = None, ): """ Create an InspectAI solver from this chat.