diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ef1011e..f8f7217a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### New features * `.stream()` and `.stream_async()` now support a `data_model` parameter for structured data extraction while streaming. (#262) +* `.to_solver()` now supports a `data_model` parameter for structured data extraction in evals. When provided, the solver uses `.chat_structured()` instead of `.chat()` and outputs JSON-serialized data. (#264) ## [0.15.0] - 2026-01-06 diff --git a/chatlas/_chat.py b/chatlas/_chat.py index a0ee8f98..709f5bc0 100644 --- a/chatlas/_chat.py +++ b/chatlas/_chat.py @@ -836,6 +836,7 @@ def to_solver( *, include_system_prompt: bool = False, include_turns: bool = False, + data_model: type[BaseModel] | None = None, ): """ Create an InspectAI solver from this chat. @@ -847,6 +848,11 @@ def to_solver( Parameters ---------- + data_model + A Pydantic model describing the structure of the data to extract. + When provided, the solver will use `.chat_structured()` instead of + `.chat()` to generate responses, and the output completion will be + JSON serialized from the model instance. include_system_prompt Whether to include the system prompt in the solver's starting messages. @@ -977,8 +983,14 @@ async def solve(state: InspectTaskState, generate): input_content = [input_content] input_content = [inspect_content_as_chatlas(x) for x in input_content] - # Generate the response (this can generate multiple turns!) - await chat_instance.chat_async(*input_content, echo="none") + # Generate the response + structured_result: BaseModel | None = None + if data_model is not None: + structured_result = await chat_instance.chat_structured_async( + *input_content, data_model=data_model, echo="none" + ) + else: + await chat_instance.chat_async(*input_content, echo="none") # Map change in chatlas Turn state back to Inspect message.state # (Note: we skip the user prompt turn since it's already included) @@ -1001,10 +1013,15 @@ async def solve(state: InspectTaskState, generate): "Expected the last message in InspectAI state to be an assistant message" ) + if structured_result is not None: + completion = structured_result.model_dump_json() + else: + completion = turns[-1].text + state.output = imodel.ModelOutput( model=model, choices=[imodel.ChatCompletionChoice(message=last_message)], - completion=turns[-1].text, + completion=completion, usage=usage, time=time.perf_counter() - start_time, ) diff --git a/tests/_vcr/test_inspect/TestInspectIntegration.test_structured_output.yaml b/tests/_vcr/test_inspect/TestInspectIntegration.test_structured_output.yaml new file mode 100644 index 00000000..b6d791d6 --- /dev/null +++ b/tests/_vcr/test_inspect/TestInspectIntegration.test_structured_output.yaml @@ -0,0 +1,259 @@ +interactions: +- request: + body: '{"max_tokens": 4096, "messages": [{"role": "user", "content": [{"text": + "John Smith is 42 years old.", "type": "text", "cache_control": {"type": "ephemeral", + "ttl": "5m"}}]}], "model": "claude-haiku-4-5-20251001", "stream": false, "system": + [{"type": "text", "text": "Extract person information from the text. Return + only the structured data.", "cache_control": {"type": "ephemeral", "ttl": "5m"}}], + "tool_choice": {"type": "tool", "name": "_structured_tool_call"}, "tools": [{"name": + "_structured_tool_call", "input_schema": {"type": "object", "properties": {"data": + {"properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, "required": + ["name", "age"], "type": "object", "additionalProperties": false}}}, "description": + "Extract structured data"}]}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '708' + Content-Type: + - application/json + Host: + - api.anthropic.com + X-Stainless-Async: + - async:asyncio + anthropic-version: + - '2023-06-01' + x-stainless-read-timeout: + - '600' + x-stainless-timeout: + - '600' + method: POST + uri: https://api.anthropic.com/v1/messages + response: + body: + string: '{"model":"claude-haiku-4-5-20251001","id":"msg_011ppdxvaxeQ3E5pouB6zmeT","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_01Ji9YiEGtUtDtY3ezUbierv","name":"_structured_tool_call","input":{"data":{"name":"John + Smith","age":42}}}],"stop_reason":"tool_use","stop_sequence":null,"usage":{"input_tokens":711,"cache_creation_input_tokens":0,"cache_read_input_tokens":0,"cache_creation":{"ephemeral_5m_input_tokens":0,"ephemeral_1h_input_tokens":0},"output_tokens":44,"service_tier":"standard"}}' + headers: + CF-RAY: + - 9b9cf181cccfe75f-DEN + Connection: + - keep-alive + Content-Type: + - application/json + Date: + - Tue, 06 Jan 2026 17:30:28 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Robots-Tag: + - none + anthropic-ratelimit-input-tokens-limit: + - '4000000' + anthropic-ratelimit-input-tokens-remaining: + - '4000000' + anthropic-ratelimit-input-tokens-reset: + - '2026-01-06T17:30:27Z' + anthropic-ratelimit-output-tokens-limit: + - '800000' + anthropic-ratelimit-output-tokens-remaining: + - '800000' + anthropic-ratelimit-output-tokens-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-requests-limit: + - '4000' + anthropic-ratelimit-requests-remaining: + - '3999' + anthropic-ratelimit-requests-reset: + - '2026-01-06T17:30:27Z' + anthropic-ratelimit-tokens-limit: + - '4800000' + anthropic-ratelimit-tokens-remaining: + - '4800000' + anthropic-ratelimit-tokens-reset: + - '2026-01-06T17:30:27Z' + cf-cache-status: + - DYNAMIC + content-length: + - '517' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-envoy-upstream-service-time: + - '908' + status: + code: 200 + message: OK +- request: + body: '{"max_tokens": 32000, "messages": [{"role": "user", "content": "\nYou are + assessing a submitted answer on a given task based on a criterion. Here is the + data:\n\n[BEGIN DATA]\n***\n[Task]: John Smith is 42 years old.\n***\n[Submission]: + {\"name\":\"John Smith\",\"age\":42}\n***\n[Criterion]: {\"name\": \"John Smith\", + \"age\": 42}\n***\n[END DATA]\n\nDoes the submission meet the criterion?\n\n\nAfter + assessing the submitted answer, reply with ''GRADE: $LETTER'' (without quotes) + where LETTER is one of CI. Please choose ONE option for the grade: either \"C\" + for correct answers, or \"I\" for incorrect answers.\n\nFor example, after reviewing + a correct answer you might write ''GRADE: C'' or after reviewing an incorrect + answer you might write ''GRADE: I''.\n\nFirst, write out in a step by step manner + your reasoning about the criterion to be sure that your conclusion is correct. + Avoid simply stating the correct answers at the outset. Then, end with your + answer formatted as ''GRADE: $LETTER'' (without quotes) where LETTER is one + of CI.\n\n"}], "model": "claude-haiku-4-5-20251001", "tools": [], "stream": + true}' + headers: + Accept: + - application/json + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '1101' + Content-Type: + - application/json + Host: + - api.anthropic.com + X-Stainless-Async: + - async:asyncio + X-Stainless-Helper-Method: + - stream + X-Stainless-Stream-Helper: + - messages + anthropic-version: + - '2023-06-01' + x-irid: + - fGWumscTVoUzHYbrDFdkF6 + x-stainless-read-timeout: + - '600' + x-stainless-timeout: + - NOT_GIVEN + method: POST + uri: https://api.anthropic.com/v1/messages + response: + body: + string: "event: message_start\ndata: {\"type\":\"message_start\",\"message\":{\"model\":\"claude-haiku-4-5-20251001\",\"id\":\"msg_01Ez5kNaynHQCNnUh1S2WnBL\",\"type\":\"message\",\"role\":\"assistant\",\"content\":[],\"stop_reason\":null,\"stop_sequence\":null,\"usage\":{\"input_tokens\":259,\"cache_creation_input_tokens\":0,\"cache_read_input_tokens\":0,\"cache_creation\":{\"ephemeral_5m_input_tokens\":0,\"ephemeral_1h_input_tokens\":0},\"output_tokens\":1,\"service_tier\":\"standard\"}}}\n\nevent: + content_block_start\ndata: {\"type\":\"content_block_start\",\"index\":0,\"content_block\":{\"type\":\"text\",\"text\":\"\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"Let\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + me assess\"} }\n\nevent: ping\ndata: {\"type\": \"ping\"}\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + this step\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + by step.\\n\\n**\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"Comparing\"} + }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + the\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + Submission to the Criterion:**\\n\\nThe\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + criterion\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + spec\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"ifies:\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n- + name\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\": + \\\"John Smith\\\"\\n- age\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\": + 42\\n\\nThe submission provides:\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n- + name: \\\"John Smith\\\"\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n- + age: 42\\n\\n**\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"Checking + each\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + fiel\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d:**\\n\\n1. + **Name fiel\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d**: + The\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + submission\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + has \\\"John Smith\\\" an\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d + the criterion requires\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + \\\"John Smith\\\"\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\u2713\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + Match\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n2. + **Age field**: The submission\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + has \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"42 + and the criterion requires 42\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + \u2713 Match\\n3\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\". + **Format\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"**: + Both\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + are in JSON\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + object\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + format with\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + the\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + same structure \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\u2713 + Match\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n4. + **Data\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + types\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"**: + Both fields\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + match\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + in\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + type (\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"string + for\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + name, number for age) \"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\u2713 + Match\\n\\n**Conclusion:**\"} }\n\nevent: content_block_delta\ndata: + {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n\\nThe\"} + \ }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + submission exactly\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + matches all\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + requirements\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + specifie\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d + in the criterion.\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + Both\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + the\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + fiel\"}}\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"d + names and their\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + corresponding\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\" + values are identical.\"} }\n\nevent: content_block_delta\ndata: {\"type\":\"content_block_delta\",\"index\":0,\"delta\":{\"type\":\"text_delta\",\"text\":\"\\n\\nGRADE: + C\"} }\n\nevent: content_block_stop\ndata: {\"type\":\"content_block_stop\",\"index\":0 + \ }\n\nevent: message_delta\ndata: {\"type\":\"message_delta\",\"delta\":{\"stop_reason\":\"end_turn\",\"stop_sequence\":null},\"usage\":{\"input_tokens\":259,\"cache_creation_input_tokens\":0,\"cache_read_input_tokens\":0,\"output_tokens\":201} + \ }\n\nevent: message_stop\ndata: {\"type\":\"message_stop\" }\n\n" + headers: + CF-RAY: + - 9b9cf189adb07c34-DEN + Cache-Control: + - no-cache + Connection: + - keep-alive + Content-Type: + - text/event-stream; charset=utf-8 + Date: + - Tue, 06 Jan 2026 17:30:29 GMT + Server: + - cloudflare + Transfer-Encoding: + - chunked + X-Robots-Tag: + - none + anthropic-ratelimit-input-tokens-limit: + - '4000000' + anthropic-ratelimit-input-tokens-remaining: + - '4000000' + anthropic-ratelimit-input-tokens-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-output-tokens-limit: + - '800000' + anthropic-ratelimit-output-tokens-remaining: + - '800000' + anthropic-ratelimit-output-tokens-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-requests-limit: + - '4000' + anthropic-ratelimit-requests-remaining: + - '3999' + anthropic-ratelimit-requests-reset: + - '2026-01-06T17:30:28Z' + anthropic-ratelimit-tokens-limit: + - '4800000' + anthropic-ratelimit-tokens-remaining: + - '4800000' + anthropic-ratelimit-tokens-reset: + - '2026-01-06T17:30:28Z' + cf-cache-status: + - DYNAMIC + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-envoy-upstream-service-time: + - '290' + status: + code: 200 + message: OK +version: 1 diff --git a/tests/test_inspect.py b/tests/test_inspect.py index 95fc6e84..c2c980fb 100644 --- a/tests/test_inspect.py +++ b/tests/test_inspect.py @@ -1,8 +1,10 @@ import datetime +import json import sys from unittest.mock import patch import pytest +from pydantic import BaseModel from chatlas import AssistantTurn, Chat, ChatAnthropic, ContentToolRequest, Turn, UserTurn from chatlas._content import ( @@ -349,6 +351,44 @@ def get_current_date(): accuracy = results.scores[0].metrics["accuracy"].value assert accuracy == 1, f"Expected accuracy of 1, but got {accuracy}" + @pytest.mark.vcr + def test_structured_output(self): + """Test that to_solver() works with data_model for structured output.""" + + class Person(BaseModel): + name: str + age: int + + chat = chat_func( + system_prompt="Extract person information from the text. Return only the structured data." + ) + + # Create task with data_model parameter + task = Task( + dataset=[ + Sample( + input="John Smith is 42 years old.", + target='{"name": "John Smith", "age": 42}', + ) + ], + solver=chat.to_solver(data_model=Person), + scorer=model_graded_qa(model=SCORER_MODEL), + ) + + log = inspect_eval(task)[0] + results = log.results + + assert results is not None + assert log.samples is not None + + # Verify the output is valid JSON matching our model + sample = log.samples[0] + assert sample.output.completion is not None + completion = sample.output.completion + parsed = json.loads(completion) + assert parsed["name"] == "John Smith" + assert parsed["age"] == 42 + # Skip VCR for multi-sample tests - response ordering with VCR is unreliable # when body matching is disabled (required due to dynamic IDs in requests) @pytest.mark.skipif(