diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml index faa185c9e8..6c5e28ddc4 100644 --- a/client-sdks/stainless/openapi.yml +++ b/client-sdks/stainless/openapi.yml @@ -8476,6 +8476,13 @@ components: sequence_number: title: Sequence Number type: integer + logprobs: + anyOf: + - items: + $ref: '#/components/schemas/OpenAITokenLogProb' + type: array + - type: 'null' + nullable: true type: const: response.output_text.done default: response.output_text.done diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml index 5441f89128..7a26089a36 100644 --- a/docs/static/deprecated-llama-stack-spec.yaml +++ b/docs/static/deprecated-llama-stack-spec.yaml @@ -5177,6 +5177,13 @@ components: sequence_number: title: Sequence Number type: integer + logprobs: + anyOf: + - items: + $ref: '#/components/schemas/OpenAITokenLogProb' + type: array + - type: 'null' + nullable: true type: const: response.output_text.done default: response.output_text.done diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml index 6ddb867eb2..1d3096e32d 100644 --- a/docs/static/experimental-llama-stack-spec.yaml +++ b/docs/static/experimental-llama-stack-spec.yaml @@ -5358,6 +5358,13 @@ components: sequence_number: title: Sequence Number type: integer + logprobs: + anyOf: + - items: + $ref: '#/components/schemas/OpenAITokenLogProb' + type: array + - type: 'null' + nullable: true type: const: response.output_text.done default: response.output_text.done diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml index 62fb33c873..1ec7ebc267 100644 --- a/docs/static/llama-stack-spec.yaml +++ b/docs/static/llama-stack-spec.yaml @@ -7361,6 +7361,13 @@ components: sequence_number: title: Sequence Number type: integer + logprobs: + anyOf: + - items: + $ref: '#/components/schemas/OpenAITokenLogProb' + type: array + - type: 'null' + nullable: true type: const: response.output_text.done default: response.output_text.done diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml index faa185c9e8..6c5e28ddc4 100644 --- a/docs/static/stainless-llama-stack-spec.yaml +++ b/docs/static/stainless-llama-stack-spec.yaml @@ -8476,6 +8476,13 @@ components: sequence_number: title: Sequence Number type: integer + logprobs: + anyOf: + - items: + $ref: '#/components/schemas/OpenAITokenLogProb' + type: array + - type: 'null' + nullable: true type: const: response.output_text.done default: response.output_text.done diff --git a/src/llama_stack/providers/inline/responses/builtin/responses/streaming.py b/src/llama_stack/providers/inline/responses/builtin/responses/streaming.py index 54a28b9bd5..aed7459738 100644 --- a/src/llama_stack/providers/inline/responses/builtin/responses/streaming.py +++ b/src/llama_stack/providers/inline/responses/builtin/responses/streaming.py @@ -74,6 +74,7 @@ OpenAIResponseObjectStreamResponseOutputItemAdded, OpenAIResponseObjectStreamResponseOutputItemDone, OpenAIResponseObjectStreamResponseOutputTextDelta, + OpenAIResponseObjectStreamResponseOutputTextDone, OpenAIResponseObjectStreamResponseReasoningTextDelta, OpenAIResponseObjectStreamResponseReasoningTextDone, OpenAIResponseObjectStreamResponseRefusalDelta, @@ -1145,9 +1146,19 @@ async def _process_streaming_chunks( sequence_number=self.sequence_number, ) - # Emit content_part.done event if text content was streamed (before content gets cleared) + # Emit output_text.done and content_part.done events if text content was streamed if content_part_emitted: final_text = "".join(chat_response_content) + # Emit output_text.done with the final accumulated text (per OpenAI protocol) + self.sequence_number += 1 + yield OpenAIResponseObjectStreamResponseOutputTextDone( + content_index=content_index, + text=final_text, + item_id=message_item_id, + output_index=message_output_index, + sequence_number=self.sequence_number, + logprobs=chat_response_logprobs if chat_response_logprobs else [], + ) self.sequence_number += 1 yield OpenAIResponseObjectStreamResponseContentPartDone( content_index=content_index, diff --git a/src/llama_stack_api/openai_responses.py b/src/llama_stack_api/openai_responses.py index 1797fe0310..173e30b0ea 100644 --- a/src/llama_stack_api/openai_responses.py +++ b/src/llama_stack_api/openai_responses.py @@ -928,6 +928,7 @@ class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel): :param item_id: Unique identifier of the completed output item :param output_index: Index position of the item in the output list :param sequence_number: Sequential number for ordering streaming events + :param logprobs: Token log probability details for the completed text :param type: Event type identifier, always "response.output_text.done" """ @@ -936,6 +937,7 @@ class OpenAIResponseObjectStreamResponseOutputTextDone(BaseModel): item_id: str output_index: int sequence_number: int + logprobs: list[OpenAITokenLogProb] | None = None type: Literal["response.output_text.done"] = "response.output_text.done" diff --git a/tests/unit/providers/responses/builtin/test_openai_responses.py b/tests/unit/providers/responses/builtin/test_openai_responses.py index 979ae114e1..e64d66d36b 100644 --- a/tests/unit/providers/responses/builtin/test_openai_responses.py +++ b/tests/unit/providers/responses/builtin/test_openai_responses.py @@ -241,7 +241,8 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m ) # Should have content part events for text streaming - # Expected: response.created, response.in_progress, content_part.added, output_text.delta, content_part.done, response.completed + # Expected: response.created, response.in_progress, output_item.added, content_part.added, + # output_text.delta, output_text.done, content_part.done, output_item.done, response.completed assert len(chunks) >= 5 assert chunks[0].type == "response.created" assert any(chunk.type == "response.in_progress" for chunk in chunks) @@ -250,10 +251,12 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m content_part_added_events = [c for c in chunks if c.type == "response.content_part.added"] content_part_done_events = [c for c in chunks if c.type == "response.content_part.done"] text_delta_events = [c for c in chunks if c.type == "response.output_text.delta"] + text_done_events = [c for c in chunks if c.type == "response.output_text.done"] assert len(content_part_added_events) >= 1, "Should have content_part.added event for text" assert len(content_part_done_events) >= 1, "Should have content_part.done event for text" assert len(text_delta_events) >= 1, "Should have text delta events" + assert len(text_done_events) >= 1, "Should have output_text.done event with final accumulated text" added_event = content_part_added_events[0] done_event = content_part_done_events[0] @@ -263,6 +266,20 @@ async def test_create_openai_response_with_string_input(openai_responses_impl, m assert added_event.item_id == done_event.item_id assert added_event.response_id == done_event.response_id + # Verify output_text.done contains the final accumulated text and correct indices + text_done_event = text_done_events[0] + assert text_done_event.content_index == 0 + assert text_done_event.output_index == 0 + assert text_done_event.item_id == added_event.item_id + assert isinstance(text_done_event.text, str) + assert len(text_done_event.text) > 0, "output_text.done should contain the final text" + + # Verify output_text.done comes before content_part.done (per OpenAI protocol) + chunk_types = [c.type for c in chunks] + text_done_idx = chunk_types.index("response.output_text.done") + content_done_idx = chunk_types.index("response.content_part.done") + assert text_done_idx < content_done_idx, "output_text.done must precede content_part.done" + # Verify final event is completion assert chunks[-1].type == "response.completed"