From ebfa8ad4fbdcdee4c452667f92bb723d8e151324 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Date: Tue, 28 Jan 2025 12:27:21 -0800
Subject: [PATCH 1/5] Update OpenAPI generator to add param and field
 documentation

---
 docs/openapi_generator/generate.py            |  13 +-
 docs/openapi_generator/pyopenapi/generator.py |  34 +-
 .../openapi_generator/strong_typing/schema.py |   1 +
 docs/resources/llama-stack-spec.html          | 380 ++++++++++--------
 docs/resources/llama-stack-spec.yaml          | 364 ++++++++++-------
 .../apis/batch_inference/batch_inference.py   |  33 +-
 llama_stack/apis/inference/inference.py       |  81 ++--
 7 files changed, 517 insertions(+), 389 deletions(-)

diff --git a/docs/openapi_generator/generate.py b/docs/openapi_generator/generate.py
index 1a59369cbb..48109e5d87 100644
--- a/docs/openapi_generator/generate.py
+++ b/docs/openapi_generator/generate.py
@@ -36,6 +36,16 @@
 from .pyopenapi.utility import Specification  # noqa: E402
 
 
+def str_presenter(dumper, data):
+    """Represent ``data`` as a YAML string, folding long or multi-line text."""
+    plain_prefixes = (f"/{LLAMA_STACK_API_VERSION}", "#/components/schemas/")
+    if data.startswith(plain_prefixes):
+        style = None  # versioned route paths and schema $refs keep the default style
+    else:
+        style = ">" if "\n" in data or len(data) > 40 else None
+    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style=style)
+
+
 def main(output_dir: str):
     output_dir = Path(output_dir)
     if not output_dir.exists():
@@ -69,7 +79,8 @@ def main(output_dir: str):
         y.sequence_dash_offset = 2
         y.width = 80
         y.allow_unicode = True
-        y.explicit_start = True
+        y.representer.add_representer(str, str_presenter)
+
         y.dump(
             spec.get_json(),
             fp,
diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index 317b895b5a..d8e0d81ed1 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -8,6 +8,7 @@
 import hashlib
 import ipaddress
 import typing
+from dataclasses import field, make_dataclass
 from typing import Any, Dict, Set, Union
 
 from ..strong_typing.core import JsonType
@@ -276,6 +277,20 @@ class StatusResponse:
     examples: List[Any] = dataclasses.field(default_factory=list)
 
 
+def create_docstring_for_request(
+    request_name: str, fields: List[Tuple[str, type]], doc_params: Dict[str, str]
+) -> str:
+    """Creates a ReST-style docstring for a dynamically generated request dataclass."""
+    lines = ["\n"]  # no summary text: the docstring opens with blank lines only
+
+    # Emit one ":param" line per (name, type) field; undocumented names get ""
+    for name, type_ in fields:
+        desc = doc_params.get(name, "")
+        lines.append(f":param {name}: {desc}")
+
+    return "\n".join(lines)
+
+
 class ResponseBuilder:
     content_builder: ContentBuilder
 
@@ -493,11 +508,24 @@ def _build_operation(self, op: EndpointOperation) -> Operation:
             first = next(iter(op.request_params))
             request_name, request_type = first
 
-            from dataclasses import make_dataclass
-
             op_name = "".join(word.capitalize() for word in op.name.split("_"))
             request_name = f"{op_name}Request"
-            request_type = make_dataclass(request_name, op.request_params)
+            fields = [
+                (
+                    name,
+                    type_,
+                )
+                for name, type_ in op.request_params
+            ]
+            request_type = make_dataclass(
+                request_name,
+                fields,
+                namespace={
+                    "__doc__": create_docstring_for_request(
+                        request_name, fields, doc_params
+                    )
+                },
+            )
 
             requestBody = RequestBody(
                 content={
diff --git a/docs/openapi_generator/strong_typing/schema.py b/docs/openapi_generator/strong_typing/schema.py
index 826efdb4a7..f4393041ff 100644
--- a/docs/openapi_generator/strong_typing/schema.py
+++ b/docs/openapi_generator/strong_typing/schema.py
@@ -531,6 +531,7 @@ def type_to_schema(self, data_type: TypeLike, force_expand: bool = False) -> Sch
             # add property docstring if available
             property_doc = property_docstrings.get(property_name)
             if property_doc:
+                # a documented property drops its "title" in favour of the description
                 property_def.pop("title", None)
                 property_def["description"] = property_doc
 
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index 5998963d20..b720bef21e 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -190,7 +190,7 @@
             "post": {
                 "responses": {
                     "200": {
-                        "description": "Chat completion response. **OR** SSE-stream of these events.",
+                        "description": "If stream=False, returns a ChatCompletionResponse with the full completion. If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk",
                         "content": {
                             "text/event-stream": {
                                 "schema": {
@@ -210,6 +210,7 @@
                 "tags": [
                     "Inference"
                 ],
+                "summary": "Generate a chat completion for the given messages using the specified model.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -227,7 +228,7 @@
             "post": {
                 "responses": {
                     "200": {
-                        "description": "Completion response. **OR** streamed completion response.",
+                        "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk",
                         "content": {
                             "text/event-stream": {
                                 "schema": {
@@ -247,6 +248,7 @@
                 "tags": [
                     "Inference"
                 ],
+                "summary": "Generate a completion for the given content using the specified model.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -485,7 +487,7 @@
             "post": {
                 "responses": {
                     "200": {
-                        "description": "OK",
+                        "description": "An array of embeddings, one for each content. Each embedding is a list of floats.",
                         "content": {
                             "application/json": {
                                 "schema": {
@@ -498,6 +500,7 @@
                 "tags": [
                     "Inference"
                 ],
+                "summary": "Generate embeddings for content pieces using the specified model.",
                 "parameters": [],
                 "requestBody": {
                     "content": {
@@ -2372,6 +2375,46 @@
                     "tool_calls"
                 ]
             },
+            "GrammarResponseFormat": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "grammar",
+                        "default": "grammar"
+                    },
+                    "bnf": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "bnf"
+                ]
+            },
             "GreedySamplingStrategy": {
                 "type": "object",
                 "properties": {
@@ -2447,6 +2490,46 @@
                     }
                 }
             },
+            "JsonSchemaResponseFormat": {
+                "type": "object",
+                "properties": {
+                    "type": {
+                        "type": "string",
+                        "const": "json_schema",
+                        "default": "json_schema"
+                    },
+                    "json_schema": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "oneOf": [
+                                {
+                                    "type": "null"
+                                },
+                                {
+                                    "type": "boolean"
+                                },
+                                {
+                                    "type": "number"
+                                },
+                                {
+                                    "type": "string"
+                                },
+                                {
+                                    "type": "array"
+                                },
+                                {
+                                    "type": "object"
+                                }
+                            ]
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "type",
+                    "json_schema"
+                ]
+            },
             "Message": {
                 "oneOf": [
                     {
@@ -2472,6 +2555,23 @@
                     }
                 }
             },
+            "ResponseFormat": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/JsonSchemaResponseFormat"
+                    },
+                    {
+                        "$ref": "#/components/schemas/GrammarResponseFormat"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
+                        "grammar": "#/components/schemas/GrammarResponseFormat"
+                    }
+                }
+            },
             "SamplingParams": {
                 "type": "object",
                 "properties": {
@@ -2865,6 +2965,9 @@
                     "tool_prompt_format": {
                         "$ref": "#/components/schemas/ToolPromptFormat"
                     },
+                    "response_format": {
+                        "$ref": "#/components/schemas/ResponseFormat"
+                    },
                     "logprobs": {
                         "type": "object",
                         "properties": {
@@ -2885,16 +2988,49 @@
             "BatchChatCompletionResponse": {
                 "type": "object",
                 "properties": {
-                    "completion_message_batch": {
+                    "batch": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/ChatCompletionResponse"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "batch"
+                ]
+            },
+            "ChatCompletionResponse": {
+                "type": "object",
+                "properties": {
+                    "completion_message": {
+                        "$ref": "#/components/schemas/CompletionMessage"
+                    },
+                    "logprobs": {
                         "type": "array",
                         "items": {
-                            "$ref": "#/components/schemas/CompletionMessage"
+                            "$ref": "#/components/schemas/TokenLogProbs"
                         }
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "completion_message_batch"
+                    "completion_message"
+                ]
+            },
+            "TokenLogProbs": {
+                "type": "object",
+                "properties": {
+                    "logprobs_by_token": {
+                        "type": "object",
+                        "additionalProperties": {
+                            "type": "number"
+                        }
+                    }
+                },
+                "additionalProperties": false,
+                "required": [
+                    "logprobs_by_token"
                 ]
             },
             "BatchCompletionRequest": {
@@ -2912,6 +3048,9 @@
                     "sampling_params": {
                         "$ref": "#/components/schemas/SamplingParams"
                     },
+                    "response_format": {
+                        "$ref": "#/components/schemas/ResponseFormat"
+                    },
                     "logprobs": {
                         "type": "object",
                         "properties": {
@@ -2932,159 +3071,93 @@
             "BatchCompletionResponse": {
                 "type": "object",
                 "properties": {
-                    "completion_message_batch": {
+                    "batch": {
                         "type": "array",
                         "items": {
-                            "$ref": "#/components/schemas/CompletionMessage"
+                            "$ref": "#/components/schemas/CompletionResponse"
                         }
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "completion_message_batch"
+                    "batch"
                 ]
             },
-            "CancelTrainingJobRequest": {
+            "CompletionResponse": {
                 "type": "object",
                 "properties": {
-                    "job_uuid": {
+                    "content": {
                         "type": "string"
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "job_uuid"
-                ]
-            },
-            "GrammarResponseFormat": {
-                "type": "object",
-                "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "grammar",
-                        "default": "grammar"
                     },
-                    "bnf": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
+                    "stop_reason": {
+                        "$ref": "#/components/schemas/StopReason"
+                    },
+                    "logprobs": {
+                        "type": "array",
+                        "items": {
+                            "$ref": "#/components/schemas/TokenLogProbs"
                         }
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "type",
-                    "bnf"
-                ]
+                    "content",
+                    "stop_reason"
+                ],
+                "title": "Completion response."
             },
-            "JsonSchemaResponseFormat": {
+            "CancelTrainingJobRequest": {
                 "type": "object",
                 "properties": {
-                    "type": {
-                        "type": "string",
-                        "const": "json_schema",
-                        "default": "json_schema"
-                    },
-                    "json_schema": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "oneOf": [
-                                {
-                                    "type": "null"
-                                },
-                                {
-                                    "type": "boolean"
-                                },
-                                {
-                                    "type": "number"
-                                },
-                                {
-                                    "type": "string"
-                                },
-                                {
-                                    "type": "array"
-                                },
-                                {
-                                    "type": "object"
-                                }
-                            ]
-                        }
+                    "job_uuid": {
+                        "type": "string"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
-                    "type",
-                    "json_schema"
+                    "job_uuid"
                 ]
             },
-            "ResponseFormat": {
-                "oneOf": [
-                    {
-                        "$ref": "#/components/schemas/JsonSchemaResponseFormat"
-                    },
-                    {
-                        "$ref": "#/components/schemas/GrammarResponseFormat"
-                    }
-                ],
-                "discriminator": {
-                    "propertyName": "type",
-                    "mapping": {
-                        "json_schema": "#/components/schemas/JsonSchemaResponseFormat",
-                        "grammar": "#/components/schemas/GrammarResponseFormat"
-                    }
-                }
-            },
             "ChatCompletionRequest": {
                 "type": "object",
                 "properties": {
                     "model_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The identifier of the model to use"
                     },
                     "messages": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/Message"
-                        }
+                        },
+                        "description": "List of messages in the conversation"
                     },
                     "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams"
+                        "$ref": "#/components/schemas/SamplingParams",
+                        "description": "Parameters to control the sampling strategy"
                     },
                     "tools": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/ToolDefinition"
-                        }
+                        },
+                        "description": "(Optional) List of tool definitions available to the model"
                     },
                     "tool_choice": {
-                        "$ref": "#/components/schemas/ToolChoice"
+                        "$ref": "#/components/schemas/ToolChoice",
+                        "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto."
                     },
                     "tool_prompt_format": {
-                        "$ref": "#/components/schemas/ToolPromptFormat"
+                        "$ref": "#/components/schemas/ToolPromptFormat",
+                        "description": "(Optional) Specifies how tool definitions are formatted when presenting to the model"
                     },
                     "response_format": {
-                        "$ref": "#/components/schemas/ResponseFormat"
+                        "$ref": "#/components/schemas/ResponseFormat",
+                        "description": "(Optional) Grammar specification for guided (structured) decoding"
                     },
                     "stream": {
-                        "type": "boolean"
+                        "type": "boolean",
+                        "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
                     },
                     "logprobs": {
                         "type": "object",
@@ -3094,7 +3167,8 @@
                                 "default": 0
                             }
                         },
-                        "additionalProperties": false
+                        "additionalProperties": false,
+                        "description": "(Optional) If specified, log probabilities for each token position will be returned."
                     }
                 },
                 "additionalProperties": false,
@@ -3103,25 +3177,6 @@
                     "messages"
                 ]
             },
-            "ChatCompletionResponse": {
-                "type": "object",
-                "properties": {
-                    "completion_message": {
-                        "$ref": "#/components/schemas/CompletionMessage"
-                    },
-                    "logprobs": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/TokenLogProbs"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "completion_message"
-                ],
-                "title": "Chat completion response."
-            },
             "ChatCompletionResponseEvent": {
                 "type": "object",
                 "properties": {
@@ -3166,8 +3221,7 @@
                 "additionalProperties": false,
                 "required": [
                     "event"
-                ],
-                "title": "SSE-stream of these events."
+                ]
             },
             "ContentDelta": {
                 "oneOf": [
@@ -3227,21 +3281,6 @@
                     "text"
                 ]
             },
-            "TokenLogProbs": {
-                "type": "object",
-                "properties": {
-                    "logprobs_by_token": {
-                        "type": "object",
-                        "additionalProperties": {
-                            "type": "number"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "logprobs_by_token"
-                ]
-            },
             "ToolCallDelta": {
                 "type": "object",
                 "properties": {
@@ -3284,19 +3323,24 @@
                 "type": "object",
                 "properties": {
                     "model_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The identifier of the model to use"
                     },
                     "content": {
-                        "$ref": "#/components/schemas/InterleavedContent"
+                        "$ref": "#/components/schemas/InterleavedContent",
+                        "description": "The content to generate a completion for"
                     },
                     "sampling_params": {
-                        "$ref": "#/components/schemas/SamplingParams"
+                        "$ref": "#/components/schemas/SamplingParams",
+                        "description": "(Optional) Parameters to control the sampling strategy"
                     },
                     "response_format": {
-                        "$ref": "#/components/schemas/ResponseFormat"
+                        "$ref": "#/components/schemas/ResponseFormat",
+                        "description": "(Optional) Grammar specification for guided (structured) decoding"
                     },
                     "stream": {
-                        "type": "boolean"
+                        "type": "boolean",
+                        "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
                     },
                     "logprobs": {
                         "type": "object",
@@ -3306,7 +3350,8 @@
                                 "default": 0
                             }
                         },
-                        "additionalProperties": false
+                        "additionalProperties": false,
+                        "description": "(Optional) If specified, log probabilities for each token position will be returned."
                     }
                 },
                 "additionalProperties": false,
@@ -3315,29 +3360,6 @@
                     "content"
                 ]
             },
-            "CompletionResponse": {
-                "type": "object",
-                "properties": {
-                    "content": {
-                        "type": "string"
-                    },
-                    "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason"
-                    },
-                    "logprobs": {
-                        "type": "array",
-                        "items": {
-                            "$ref": "#/components/schemas/TokenLogProbs"
-                        }
-                    }
-                },
-                "additionalProperties": false,
-                "required": [
-                    "content",
-                    "stop_reason"
-                ],
-                "title": "Completion response."
-            },
             "CompletionResponseStreamChunk": {
                 "type": "object",
                 "properties": {
@@ -4241,13 +4263,15 @@
                 "type": "object",
                 "properties": {
                     "model_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The identifier of the model to use"
                     },
                     "contents": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/InterleavedContent"
-                        }
+                        },
+                        "description": "List of contents to generate embeddings for. Note that content can be multimodal."
                     }
                 },
                 "additionalProperties": false,
@@ -7863,7 +7887,7 @@
         },
         {
             "name": "ChatCompletionResponse",
-            "description": "Chat completion response."
+            "description": ""
         },
         {
             "name": "ChatCompletionResponseEvent",
@@ -7875,7 +7899,7 @@
         },
         {
             "name": "ChatCompletionResponseStreamChunk",
-            "description": "SSE-stream of these events."
+            "description": ""
         },
         {
             "name": "Checkpoint",
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index 1d7c4f113b..353d99d00a 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -1,11 +1,12 @@
----
 openapi: 3.1.0
 info:
   title: Llama Stack Specification
   version: v1
-  description: "This is the specification of the Llama Stack that provides\n     \
-    \           a set of endpoints and their corresponding interfaces that are tailored
-    to\n                best leverage Llama Models."
+  description: >-
+    This is the specification of the Llama Stack that provides
+                    a set of endpoints and their corresponding interfaces that are
+    tailored to
+                    best leverage Llama Models.
 servers:
   - url: http://any-hosted-llama-stack.com
 paths:
@@ -108,7 +109,9 @@ paths:
     post:
       responses:
         '200':
-          description: Chat completion response. **OR** SSE-stream of these events.
+          description: >-
+            If stream=False, returns a ChatCompletionResponse with the full completion.
+            If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
           content:
             text/event-stream:
               schema:
@@ -117,6 +120,8 @@ paths:
                   - $ref: '#/components/schemas/ChatCompletionResponseStreamChunk'
       tags:
         - Inference
+      summary: >-
+        Generate a chat completion for the given messages using the specified model.
       parameters: []
       requestBody:
         content:
@@ -128,7 +133,9 @@ paths:
     post:
       responses:
         '200':
-          description: Completion response. **OR** streamed completion response.
+          description: >-
+            If stream=False, returns a CompletionResponse with the full completion.
+            If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
           content:
             text/event-stream:
               schema:
@@ -137,6 +144,8 @@ paths:
                   - $ref: '#/components/schemas/CompletionResponseStreamChunk'
       tags:
         - Inference
+      summary: >-
+        Generate a completion for the given content using the specified model.
       parameters: []
       requestBody:
         content:
@@ -189,8 +198,9 @@ paths:
     post:
       responses:
         '200':
-          description: A single turn in an interaction with an Agentic System. **OR**
-            streamed agent turn completion response.
+          description: >-
+            A single turn in an interaction with an Agentic System. **OR** streamed
+            agent turn completion response.
           content:
             text/event-stream:
               schema:
@@ -279,13 +289,17 @@ paths:
     post:
       responses:
         '200':
-          description: OK
+          description: >-
+            An array of embeddings, one for each content. Each embedding is a list
+            of floats.
           content:
             application/json:
               schema:
                 $ref: '#/components/schemas/EmbeddingsResponse'
       tags:
         - Inference
+      summary: >-
+        Generate embeddings for content pieces using the specified model.
       parameters: []
       requestBody:
         content:
@@ -709,7 +723,8 @@ paths:
           description: OK
       tags:
         - ToolRuntime
-      summary: Index documents so they can be used by the RAG system
+      summary: >-
+        Index documents so they can be used by the RAG system
       parameters: []
       requestBody:
         content:
@@ -1109,7 +1124,8 @@ paths:
                 $ref: '#/components/schemas/RAGQueryResult'
       tags:
         - ToolRuntime
-      summary: Query the RAG system for context; typically invoked by the agent
+      summary: >-
+        Query the RAG system for context; typically invoked by the agent
       parameters: []
       requestBody:
         content:
@@ -1341,7 +1357,8 @@ paths:
       tags:
         - Inspect
       parameters: []
-jsonSchemaDialect: https://json-schema.org/draft/2020-12/schema
+jsonSchemaDialect: >-
+  https://json-schema.org/draft/2020-12/schema
 components:
   schemas:
     AppendRowsRequest:
@@ -1393,6 +1410,27 @@ components:
         - content
         - stop_reason
         - tool_calls
+    GrammarResponseFormat:
+      type: object
+      properties:
+        type:
+          type: string
+          const: grammar
+          default: grammar
+        bnf:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - type
+        - bnf
     GreedySamplingStrategy:
       type: object
       properties:
@@ -1439,6 +1477,27 @@ components:
         mapping:
           image: '#/components/schemas/ImageContentItem'
           text: '#/components/schemas/TextContentItem'
+    JsonSchemaResponseFormat:
+      type: object
+      properties:
+        type:
+          type: string
+          const: json_schema
+          default: json_schema
+        json_schema:
+          type: object
+          additionalProperties:
+            oneOf:
+              - type: 'null'
+              - type: boolean
+              - type: number
+              - type: string
+              - type: array
+              - type: object
+      additionalProperties: false
+      required:
+        - type
+        - json_schema
     Message:
       oneOf:
         - $ref: '#/components/schemas/UserMessage'
@@ -1452,6 +1511,15 @@ components:
           system: '#/components/schemas/SystemMessage'
           tool: '#/components/schemas/ToolResponseMessage'
           assistant: '#/components/schemas/CompletionMessage'
+    ResponseFormat:
+      oneOf:
+        - $ref: '#/components/schemas/JsonSchemaResponseFormat'
+        - $ref: '#/components/schemas/GrammarResponseFormat'
+      discriminator:
+        propertyName: type
+        mapping:
+          json_schema: '#/components/schemas/JsonSchemaResponseFormat'
+          grammar: '#/components/schemas/GrammarResponseFormat'
     SamplingParams:
       type: object
       properties:
@@ -1594,16 +1662,28 @@ components:
         - json
         - function_tag
         - python_list
-      title: This Enum refers to the prompt format for calling custom / zero shot
-        tools
-      description: "`json` --\n    Refers to the json format for calling tools.\n\
-        \    The json format takes the form like\n    {\n        \"type\": \"function\"\
-        ,\n        \"function\" : {\n            \"name\": \"function_name\",\n  \
-        \          \"description\": \"function_description\",\n            \"parameters\"\
-        : {...}\n        }\n    }\n\n`function_tag` --\n    This is an example of
-        how you could define\n    your own user defined format for making tool calls.\n\
-        \    The function_tag format looks like this,\n    <function=function_name>(parameters)</function>\n
-        \nThe detailed prompts for each of these formats are added to llama cli"
+      title: >-
+        This Enum refers to the prompt format for calling custom / zero shot tools
+      description: >-
+        `json` --
+            Refers to the json format for calling tools.
+            The json format takes the form like
+            {
+                "type": "function",
+                "function" : {
+                    "name": "function_name",
+                    "description": "function_description",
+                    "parameters": {...}
+                }
+            }
+
+        `function_tag` --
+            This is an example of how you could define
+            your own user defined format for making tool calls.
+            The function_tag format looks like this,
+            <function=function_name>(parameters)</function>
+
+        The detailed prompts for each of these formats are added to llama cli
     ToolResponseMessage:
       type: object
       properties:
@@ -1697,6 +1777,8 @@ components:
           $ref: '#/components/schemas/ToolChoice'
         tool_prompt_format:
           $ref: '#/components/schemas/ToolPromptFormat'
+        response_format:
+          $ref: '#/components/schemas/ResponseFormat'
         logprobs:
           type: object
           properties:
@@ -1711,13 +1793,35 @@ components:
     BatchChatCompletionResponse:
       type: object
       properties:
-        completion_message_batch:
+        batch:
           type: array
           items:
-            $ref: '#/components/schemas/CompletionMessage'
+            $ref: '#/components/schemas/ChatCompletionResponse'
       additionalProperties: false
       required:
-        - completion_message_batch
+        - batch
+    ChatCompletionResponse:
+      type: object
+      properties:
+        completion_message:
+          $ref: '#/components/schemas/CompletionMessage'
+        logprobs:
+          type: array
+          items:
+            $ref: '#/components/schemas/TokenLogProbs'
+      additionalProperties: false
+      required:
+        - completion_message
+    TokenLogProbs:
+      type: object
+      properties:
+        logprobs_by_token:
+          type: object
+          additionalProperties:
+            type: number
+      additionalProperties: false
+      required:
+        - logprobs_by_token
     BatchCompletionRequest:
       type: object
       properties:
@@ -1729,6 +1833,8 @@ components:
             $ref: '#/components/schemas/InterleavedContent'
         sampling_params:
           $ref: '#/components/schemas/SamplingParams'
+        response_format:
+          $ref: '#/components/schemas/ResponseFormat'
         logprobs:
           type: object
           properties:
@@ -1743,95 +1849,76 @@ components:
     BatchCompletionResponse:
       type: object
       properties:
-        completion_message_batch:
+        batch:
           type: array
           items:
-            $ref: '#/components/schemas/CompletionMessage'
-      additionalProperties: false
-      required:
-        - completion_message_batch
-    CancelTrainingJobRequest:
-      type: object
-      properties:
-        job_uuid:
-          type: string
+            $ref: '#/components/schemas/CompletionResponse'
       additionalProperties: false
       required:
-        - job_uuid
-    GrammarResponseFormat:
+        - batch
+    CompletionResponse:
       type: object
       properties:
-        type:
+        content:
           type: string
-          const: grammar
-          default: grammar
-        bnf:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
+        stop_reason:
+          $ref: '#/components/schemas/StopReason'
+        logprobs:
+          type: array
+          items:
+            $ref: '#/components/schemas/TokenLogProbs'
       additionalProperties: false
       required:
-        - type
-        - bnf
-    JsonSchemaResponseFormat:
+        - content
+        - stop_reason
+      title: Completion response.
+    CancelTrainingJobRequest:
       type: object
       properties:
-        type:
+        job_uuid:
           type: string
-          const: json_schema
-          default: json_schema
-        json_schema:
-          type: object
-          additionalProperties:
-            oneOf:
-              - type: 'null'
-              - type: boolean
-              - type: number
-              - type: string
-              - type: array
-              - type: object
       additionalProperties: false
       required:
-        - type
-        - json_schema
-    ResponseFormat:
-      oneOf:
-        - $ref: '#/components/schemas/JsonSchemaResponseFormat'
-        - $ref: '#/components/schemas/GrammarResponseFormat'
-      discriminator:
-        propertyName: type
-        mapping:
-          json_schema: '#/components/schemas/JsonSchemaResponseFormat'
-          grammar: '#/components/schemas/GrammarResponseFormat'
+        - job_uuid
     ChatCompletionRequest:
       type: object
       properties:
         model_id:
           type: string
+          description: The identifier of the model to use
         messages:
           type: array
           items:
             $ref: '#/components/schemas/Message'
+          description: List of messages in the conversation
         sampling_params:
           $ref: '#/components/schemas/SamplingParams'
+          description: >-
+            Parameters to control the sampling strategy
         tools:
           type: array
           items:
             $ref: '#/components/schemas/ToolDefinition'
+          description: >-
+            (Optional) List of tool definitions available to the model
         tool_choice:
           $ref: '#/components/schemas/ToolChoice'
+          description: >-
+            (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
         tool_prompt_format:
           $ref: '#/components/schemas/ToolPromptFormat'
+          description: >-
+            (Optional) Specifies how tool definitions are formatted when presenting
+            to the model
         response_format:
           $ref: '#/components/schemas/ResponseFormat'
+          description: >-
+            (Optional) Grammar specification for guided (structured) decoding
         stream:
           type: boolean
+          description: >-
+            (Optional) If True, generate an SSE event stream of the response. Defaults
+            to False.
         logprobs:
           type: object
           properties:
@@ -1839,23 +1926,13 @@ components:
               type: integer
               default: 0
           additionalProperties: false
+          description: >-
+            (Optional) If specified, log probabilities for each token position will
+            be returned.
       additionalProperties: false
       required:
         - model_id
         - messages
-    ChatCompletionResponse:
-      type: object
-      properties:
-        completion_message:
-          $ref: '#/components/schemas/CompletionMessage'
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-      additionalProperties: false
-      required:
-        - completion_message
-      title: Chat completion response.
     ChatCompletionResponseEvent:
       type: object
       properties:
@@ -1888,7 +1965,6 @@ components:
       additionalProperties: false
       required:
         - event
-      title: SSE-stream of these events.
     ContentDelta:
       oneOf:
         - $ref: '#/components/schemas/TextDelta'
@@ -1927,16 +2003,6 @@ components:
       required:
         - type
         - text
-    TokenLogProbs:
-      type: object
-      properties:
-        logprobs_by_token:
-          type: object
-          additionalProperties:
-            type: number
-      additionalProperties: false
-      required:
-        - logprobs_by_token
     ToolCallDelta:
       type: object
       properties:
@@ -1967,14 +2033,23 @@ components:
       properties:
         model_id:
           type: string
+          description: The identifier of the model to use
         content:
           $ref: '#/components/schemas/InterleavedContent'
+          description: The content to generate a completion for
         sampling_params:
           $ref: '#/components/schemas/SamplingParams'
+          description: >-
+            (Optional) Parameters to control the sampling strategy
         response_format:
           $ref: '#/components/schemas/ResponseFormat'
+          description: >-
+            (Optional) Grammar specification for guided (structured) decoding
         stream:
           type: boolean
+          description: >-
+            (Optional) If True, generate an SSE event stream of the response. Defaults
+            to False.
         logprobs:
           type: object
           properties:
@@ -1982,26 +2057,13 @@ components:
               type: integer
               default: 0
           additionalProperties: false
+          description: >-
+            (Optional) If specified, log probabilities for each token position will
+            be returned.
       additionalProperties: false
       required:
         - model_id
         - content
-    CompletionResponse:
-      type: object
-      properties:
-        content:
-          type: string
-        stop_reason:
-          $ref: '#/components/schemas/StopReason'
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-      additionalProperties: false
-      required:
-        - content
-        - stop_reason
-      title: Completion response.
     CompletionResponseStreamChunk:
       type: object
       properties:
@@ -2558,7 +2620,8 @@ components:
         - output_message
         - output_attachments
         - started_at
-      title: A single turn in an interaction with an Agentic System.
+      title: >-
+        A single turn in an interaction with an Agentic System.
     ViolationLevel:
       type: string
       enum:
@@ -2570,10 +2633,14 @@ components:
       properties:
         model_id:
           type: string
+          description: The identifier of the model to use
         contents:
           type: array
           items:
             $ref: '#/components/schemas/InterleavedContent'
+          description: >-
+            List of contents to generate embeddings for. Note that content can be
+            multimodal.
       additionalProperties: false
       required:
         - model_id
@@ -2845,7 +2912,8 @@ components:
         - session_name
         - turns
         - started_at
-      title: A single session of an interaction with an Agentic System.
+      title: >-
+        A single session of an interaction with an Agentic System.
     AgentStepResponse:
       type: object
       properties:
@@ -3194,7 +3262,8 @@ components:
         - provider_resource_id
         - provider_id
         - type
-      title: A safety shield resource that can be used to check content
+      title: >-
+        A safety shield resource that can be used to check content
     Span:
       type: object
       properties:
@@ -4684,8 +4753,9 @@ components:
       additionalProperties: false
       required:
         - synthetic_data
-      title: Response from the synthetic data generation. Batch of (prompt, response,
-        score) tuples that pass the threshold.
+      title: >-
+        Response from the synthetic data generation. Batch of (prompt, response, score)
+        tuples that pass the threshold.
     VersionInfo:
       type: object
       properties:
@@ -4763,13 +4833,13 @@ tags:
   - name: ChatCompletionRequest
     description: ''
   - name: ChatCompletionResponse
-    description: Chat completion response.
+    description: ''
   - name: ChatCompletionResponseEvent
     description: Chat completion response event.
   - name: ChatCompletionResponseEventType
     description: ''
   - name: ChatCompletionResponseStreamChunk
-    description: SSE-stream of these events.
+    description: ''
   - name: Checkpoint
     description: Checkpoint created during training runs
   - name: CompletionInputType
@@ -4998,9 +5068,11 @@ tags:
   - name: ScoringResult
     description: ''
   - name: Session
-    description: A single session of an interaction with an Agentic System.
+    description: >-
+      A single session of an interaction with an Agentic System.
   - name: Shield
-    description: A safety shield resource that can be used to check content
+    description: >-
+      A safety shield resource that can be used to check content
   - name: ShieldCallStep
     description: ''
   - name: Shields
@@ -5028,8 +5100,9 @@ tags:
     description: ''
   - name: SyntheticDataGeneration (Coming Soon)
   - name: SyntheticDataGenerationResponse
-    description: Response from the synthetic data generation. Batch of (prompt, response,
-      score) tuples that pass the threshold.
+    description: >-
+      Response from the synthetic data generation. Batch of (prompt, response, score)
+      tuples that pass the threshold.
   - name: SystemMessage
     description: ''
   - name: Telemetry
@@ -5067,15 +5140,29 @@ tags:
   - name: ToolParameter
     description: ''
   - name: ToolPromptFormat
-    description: "This Enum refers to the prompt format for calling custom / zero
-      shot tools\n\n`json` --\n    Refers to the json format for calling tools.\n\
-      \    The json format takes the form like\n    {\n        \"type\": \"function\"\
-      ,\n        \"function\" : {\n            \"name\": \"function_name\",\n    \
-      \        \"description\": \"function_description\",\n            \"parameters\"\
-      : {...}\n        }\n    }\n\n`function_tag` --\n    This is an example of how
-      you could define\n    your own user defined format for making tool calls.\n\
-      \    The function_tag format looks like this,\n    <function=function_name>(parameters)</function>\n
-      \nThe detailed prompts for each of these formats are added to llama cli"
+    description: >-
+      This Enum refers to the prompt format for calling custom / zero shot tools
+
+
+      `json` --
+          Refers to the json format for calling tools.
+          The json format takes the form like
+          {
+              "type": "function",
+              "function" : {
+                  "name": "function_name",
+                  "description": "function_description",
+                  "parameters": {...}
+              }
+          }
+
+      `function_tag` --
+          This is an example of how you could define
+          your own user defined format for making tool calls.
+          The function_tag format looks like this,
+          <function=function_name>(parameters)</function>
+
+      The detailed prompts for each of these formats are added to llama cli
   - name: ToolResponse
     description: ''
   - name: ToolResponseMessage
@@ -5090,7 +5177,8 @@ tags:
   - name: TrainingConfig
     description: ''
   - name: Turn
-    description: A single turn in an interaction with an Agentic System.
+    description: >-
+      A single turn in an interaction with an Agentic System.
   - name: URL
     description: ''
   - name: UnionType
diff --git a/llama_stack/apis/batch_inference/batch_inference.py b/llama_stack/apis/batch_inference/batch_inference.py
index ca5ba059f4..413c81c5a7 100644
--- a/llama_stack/apis/batch_inference/batch_inference.py
+++ b/llama_stack/apis/batch_inference/batch_inference.py
@@ -7,13 +7,15 @@
 from typing import List, Optional, Protocol, runtime_checkable
 
 from llama_models.schema_utils import json_schema_type, webmethod
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 
 from llama_stack.apis.inference import (
-    CompletionMessage,
+    ChatCompletionResponse,
+    CompletionResponse,
     InterleavedContent,
     LogProbConfig,
     Message,
+    ResponseFormat,
     SamplingParams,
     ToolChoice,
     ToolDefinition,
@@ -21,35 +23,14 @@
 )
 
 
-@json_schema_type
-class BatchCompletionRequest(BaseModel):
-    model: str
-    content_batch: List[InterleavedContent]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-    logprobs: Optional[LogProbConfig] = None
-
-
 @json_schema_type
 class BatchCompletionResponse(BaseModel):
-    completion_message_batch: List[CompletionMessage]
-
-
-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
-    model: str
-    messages_batch: List[List[Message]]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-
-    # zero-shot tool definitions as input to the model
-    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
-    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
-    logprobs: Optional[LogProbConfig] = None
+    batch: List[CompletionResponse]
 
 
 @json_schema_type
 class BatchChatCompletionResponse(BaseModel):
-    completion_message_batch: List[CompletionMessage]
+    batch: List[ChatCompletionResponse]
 
 
 @runtime_checkable
@@ -60,6 +41,7 @@ async def batch_completion(
         model: str,
         content_batch: List[InterleavedContent],
         sampling_params: Optional[SamplingParams] = SamplingParams(),
+        response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
     ) -> BatchCompletionResponse: ...
 
@@ -73,5 +55,6 @@ async def batch_chat_completion(
         tools: Optional[List[ToolDefinition]] = list,
         tool_choice: Optional[ToolChoice] = ToolChoice.auto,
         tool_prompt_format: Optional[ToolPromptFormat] = None,
+        response_format: Optional[ResponseFormat] = None,
         logprobs: Optional[LogProbConfig] = None,
     ) -> BatchChatCompletionResponse: ...
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 871f1f6334..36f385eb27 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -186,7 +186,6 @@ class GrammarResponseFormat(BaseModel):
 )
 
 
-@json_schema_type
 class CompletionRequest(BaseModel):
     model: str
     content: InterleavedContent
@@ -215,23 +214,6 @@ class CompletionResponseStreamChunk(BaseModel):
     logprobs: Optional[List[TokenLogProbs]] = None
 
 
-@json_schema_type
-class BatchCompletionRequest(BaseModel):
-    model: str
-    content_batch: List[InterleavedContent]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-    response_format: Optional[ResponseFormat] = None
-    logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchCompletionResponse(BaseModel):
-    """Batch completion response."""
-
-    batch: List[CompletionResponse]
-
-
-@json_schema_type
 class ChatCompletionRequest(BaseModel):
     model: str
     messages: List[Message]
@@ -249,37 +231,15 @@ class ChatCompletionRequest(BaseModel):
 
 @json_schema_type
 class ChatCompletionResponseStreamChunk(BaseModel):
-    """SSE-stream of these events."""
-
     event: ChatCompletionResponseEvent
 
 
 @json_schema_type
 class ChatCompletionResponse(BaseModel):
-    """Chat completion response."""
-
     completion_message: CompletionMessage
     logprobs: Optional[List[TokenLogProbs]] = None
 
 
-@json_schema_type
-class BatchChatCompletionRequest(BaseModel):
-    model: str
-    messages_batch: List[List[Message]]
-    sampling_params: Optional[SamplingParams] = SamplingParams()
-
-    # zero-shot tool definitions as input to the model
-    tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
-    tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
-    tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
-    logprobs: Optional[LogProbConfig] = None
-
-
-@json_schema_type
-class BatchChatCompletionResponse(BaseModel):
-    batch: List[ChatCompletionResponse]
-
-
 @json_schema_type
 class EmbeddingsResponse(BaseModel):
     embeddings: List[List[float]]
@@ -303,7 +263,19 @@ async def completion(
         response_format: Optional[ResponseFormat] = None,
         stream: Optional[bool] = False,
         logprobs: Optional[LogProbConfig] = None,
-    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]: ...
+    ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
+        """Generate a completion for the given content using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param content: The content to generate a completion for
+        :param sampling_params: (Optional) Parameters to control the sampling strategy
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding
+        :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: If stream=False, returns a CompletionResponse with the full completion.
+                 If stream=True, returns an SSE event stream of CompletionResponseStreamChunk
+        """
+        ...
 
     @webmethod(route="/inference/chat-completion", method="POST")
     async def chat_completion(
@@ -311,7 +283,6 @@ async def chat_completion(
         model_id: str,
         messages: List[Message],
         sampling_params: Optional[SamplingParams] = SamplingParams(),
-        # zero-shot tool definitions as input to the model
         tools: Optional[List[ToolDefinition]] = None,
         tool_choice: Optional[ToolChoice] = ToolChoice.auto,
         tool_prompt_format: Optional[ToolPromptFormat] = None,
@@ -320,11 +291,33 @@ async def chat_completion(
         logprobs: Optional[LogProbConfig] = None,
     ) -> Union[
         ChatCompletionResponse, AsyncIterator[ChatCompletionResponseStreamChunk]
-    ]: ...
+    ]:
+        """Generate a chat completion for the given messages using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param messages: List of messages in the conversation
+        :param sampling_params: Parameters to control the sampling strategy
+        :param tools: (Optional) List of tool definitions available to the model
+        :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
+        :param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding
+        :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
+        :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
+        :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
+                 If stream=True, returns an SSE event stream of ChatCompletionResponseStreamChunk
+        """
+        ...
 
     @webmethod(route="/inference/embeddings", method="POST")
     async def embeddings(
         self,
         model_id: str,
         contents: List[InterleavedContent],
-    ) -> EmbeddingsResponse: ...
+    ) -> EmbeddingsResponse:
+        """Generate embeddings for content pieces using the specified model.
+
+        :param model_id: The identifier of the model to use
+        :param contents: List of contents to generate embeddings for. Note that content can be multimodal.
+        :returns: An array of embeddings, one for each content. Each embedding is a list of floats.
+        """
+        ...

From 62c3c5bb7e3c3b9ded50f6928ceac87f5799da3b Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Date: Wed, 29 Jan 2025 06:32:54 -0800
Subject: [PATCH 2/5] Add and review more documentation for inference.py

---
 docs/openapi_generator/pyopenapi/generator.py |   3 +-
 docs/resources/llama-stack-spec.html          | 194 +++++++++++-------
 docs/resources/llama-stack-spec.yaml          | 184 ++++++++++++++---
 llama_stack/apis/inference/inference.py       | 151 ++++++++++++--
 4 files changed, 415 insertions(+), 117 deletions(-)

diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index d8e0d81ed1..390f0c6271 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -4,11 +4,10 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-import collections
 import hashlib
 import ipaddress
 import typing
-from dataclasses import field, make_dataclass
+from dataclasses import make_dataclass
 from typing import Any, Dict, Set, Union
 
 from ..strong_typing.core import JsonType
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index b720bef21e..58fa770104 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -487,7 +487,7 @@
             "post": {
                 "responses": {
                     "200": {
-                        "description": "An array of embeddings, one for each content. Each embedding is a list of floats.",
+                        "description": "An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}",
                         "content": {
                             "application/json": {
                                 "schema": {
@@ -2352,19 +2352,23 @@
                     "role": {
                         "type": "string",
                         "const": "assistant",
-                        "default": "assistant"
+                        "default": "assistant",
+                        "description": "Must be \"assistant\" to identify this as the model's response"
                     },
                     "content": {
-                        "$ref": "#/components/schemas/InterleavedContent"
+                        "$ref": "#/components/schemas/InterleavedContent",
+                        "description": "The content of the model's response"
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason"
+                        "$ref": "#/components/schemas/StopReason",
+                        "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
                     },
                     "tool_calls": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/ToolCall"
-                        }
+                        },
+                        "description": "List of tool calls. Each tool call is a ToolCall object."
                     }
                 },
                 "additionalProperties": false,
@@ -2373,7 +2377,8 @@
                     "content",
                     "stop_reason",
                     "tool_calls"
-                ]
+                ],
+                "title": "A message containing the model's (assistant) response in a chat conversation."
             },
             "GrammarResponseFormat": {
                 "type": "object",
@@ -2381,7 +2386,8 @@
                     "type": {
                         "type": "string",
                         "const": "grammar",
-                        "default": "grammar"
+                        "default": "grammar",
+                        "description": "Must be \"grammar\" to identify this format type"
                     },
                     "bnf": {
                         "type": "object",
@@ -2406,14 +2412,16 @@
                                     "type": "object"
                                 }
                             ]
-                        }
+                        },
+                        "description": "The BNF grammar specification the response should conform to"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "type",
                     "bnf"
-                ]
+                ],
+                "title": "Configuration for grammar-guided response generation."
             },
             "GreedySamplingStrategy": {
                 "type": "object",
@@ -2496,7 +2504,8 @@
                     "type": {
                         "type": "string",
                         "const": "json_schema",
-                        "default": "json_schema"
+                        "default": "json_schema",
+                        "description": "Must be \"json_schema\" to identify this format type"
                     },
                     "json_schema": {
                         "type": "object",
@@ -2521,14 +2530,16 @@
                                     "type": "object"
                                 }
                             ]
-                        }
+                        },
+                        "description": "The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "type",
                     "json_schema"
-                ]
+                ],
+                "title": "Configuration for JSON schema-guided response generation."
             },
             "Message": {
                 "oneOf": [
@@ -2624,17 +2635,20 @@
                     "role": {
                         "type": "string",
                         "const": "system",
-                        "default": "system"
+                        "default": "system",
+                        "description": "Must be \"system\" to identify this as a system message"
                     },
                     "content": {
-                        "$ref": "#/components/schemas/InterleavedContent"
+                        "$ref": "#/components/schemas/InterleavedContent",
+                        "description": "The content of the \"system prompt\". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions)."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "role",
                     "content"
-                ]
+                ],
+                "title": "A system message providing instructions or context to the model."
             },
             "TextContentItem": {
                 "type": "object",
@@ -2749,7 +2763,8 @@
                 "enum": [
                     "auto",
                     "required"
-                ]
+                ],
+                "title": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
             },
             "ToolDefinition": {
                 "type": "object",
@@ -2836,10 +2851,12 @@
                     "role": {
                         "type": "string",
                         "const": "tool",
-                        "default": "tool"
+                        "default": "tool",
+                        "description": "Must be \"tool\" to identify this as a tool response"
                     },
                     "call_id": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "Unique identifier for the tool call this response is for"
                     },
                     "tool_name": {
                         "oneOf": [
@@ -2849,10 +2866,12 @@
                             {
                                 "type": "string"
                             }
-                        ]
+                        ],
+                        "description": "Name of the tool that was called"
                     },
                     "content": {
-                        "$ref": "#/components/schemas/InterleavedContent"
+                        "$ref": "#/components/schemas/InterleavedContent",
+                        "description": "The response content from the tool"
                     }
                 },
                 "additionalProperties": false,
@@ -2861,7 +2880,8 @@
                     "call_id",
                     "tool_name",
                     "content"
-                ]
+                ],
+                "title": "A message representing the result of a tool invocation."
             },
             "TopKSamplingStrategy": {
                 "type": "object",
@@ -2920,20 +2940,24 @@
                     "role": {
                         "type": "string",
                         "const": "user",
-                        "default": "user"
+                        "default": "user",
+                        "description": "Must be \"user\" to identify this as a user message"
                     },
                     "content": {
-                        "$ref": "#/components/schemas/InterleavedContent"
+                        "$ref": "#/components/schemas/InterleavedContent",
+                        "description": "The content of the message, which can include text and other media"
                     },
                     "context": {
-                        "$ref": "#/components/schemas/InterleavedContent"
+                        "$ref": "#/components/schemas/InterleavedContent",
+                        "description": "(Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future."
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "role",
                     "content"
-                ]
+                ],
+                "title": "A message from the user in a chat conversation."
             },
             "BatchChatCompletionRequest": {
                 "type": "object",
@@ -2973,7 +2997,8 @@
                         "properties": {
                             "top_k": {
                                 "type": "integer",
-                                "default": 0
+                                "default": 0,
+                                "description": "How many tokens (for each position) to return log probabilities for."
                             }
                         },
                         "additionalProperties": false
@@ -3004,19 +3029,22 @@
                 "type": "object",
                 "properties": {
                     "completion_message": {
-                        "$ref": "#/components/schemas/CompletionMessage"
+                        "$ref": "#/components/schemas/CompletionMessage",
+                        "description": "The complete response message"
                     },
                     "logprobs": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/TokenLogProbs"
-                        }
+                        },
+                        "description": "Optional log probabilities for generated tokens"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "completion_message"
-                ]
+                ],
+                "title": "Response from a chat completion request."
             },
             "TokenLogProbs": {
                 "type": "object",
@@ -3025,13 +3053,15 @@
                         "type": "object",
                         "additionalProperties": {
                             "type": "number"
-                        }
+                        },
+                        "description": "Dictionary mapping tokens to their log probabilities"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "logprobs_by_token"
-                ]
+                ],
+                "title": "Log probabilities for generated tokens."
             },
             "BatchCompletionRequest": {
                 "type": "object",
@@ -3056,7 +3086,8 @@
                         "properties": {
                             "top_k": {
                                 "type": "integer",
-                                "default": 0
+                                "default": 0,
+                                "description": "How many tokens (for each position) to return log probabilities for."
                             }
                         },
                         "additionalProperties": false
@@ -3087,16 +3118,19 @@
                 "type": "object",
                 "properties": {
                     "content": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "The generated completion text"
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason"
+                        "$ref": "#/components/schemas/StopReason",
+                        "description": "Reason why generation stopped"
                     },
                     "logprobs": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/TokenLogProbs"
-                        }
+                        },
+                        "description": "Optional log probabilities for generated tokens"
                     }
                 },
                 "additionalProperties": false,
@@ -3104,7 +3138,7 @@
                     "content",
                     "stop_reason"
                 ],
-                "title": "Completion response."
+                "title": "Response from a completion request."
             },
             "CancelTrainingJobRequest": {
                 "type": "object",
@@ -3123,7 +3157,7 @@
                 "properties": {
                     "model_id": {
                         "type": "string",
-                        "description": "The identifier of the model to use"
+                        "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
                     },
                     "messages": {
                         "type": "array",
@@ -3149,11 +3183,11 @@
                     },
                     "tool_prompt_format": {
                         "$ref": "#/components/schemas/ToolPromptFormat",
-                        "description": "(Optional) Specifies how tool definitions are formatted when presenting to the model"
+                        "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
                     },
                     "response_format": {
                         "$ref": "#/components/schemas/ResponseFormat",
-                        "description": "(Optional) Grammar specification for guided (structured) decoding"
+                        "description": "(Optional) Grammar specification for guided (structured) decoding. There are two options: - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format. - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it."
                     },
                     "stream": {
                         "type": "boolean",
@@ -3164,7 +3198,8 @@
                         "properties": {
                             "top_k": {
                                 "type": "integer",
-                                "default": 0
+                                "default": 0,
+                                "description": "How many tokens (for each position) to return log probabilities for."
                             }
                         },
                         "additionalProperties": false,
@@ -3181,19 +3216,23 @@
                 "type": "object",
                 "properties": {
                     "event_type": {
-                        "$ref": "#/components/schemas/ChatCompletionResponseEventType"
+                        "$ref": "#/components/schemas/ChatCompletionResponseEventType",
+                        "description": "Type of the event"
                     },
                     "delta": {
-                        "$ref": "#/components/schemas/ContentDelta"
+                        "$ref": "#/components/schemas/ContentDelta",
+                        "description": "Content generated since last event. This can be one or more tokens, or a tool call."
                     },
                     "logprobs": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/TokenLogProbs"
-                        }
+                        },
+                        "description": "Optional log probabilities for generated tokens"
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason"
+                        "$ref": "#/components/schemas/StopReason",
+                        "description": "Optional reason why generation stopped, if complete"
                     }
                 },
                 "additionalProperties": false,
@@ -3201,7 +3240,7 @@
                     "event_type",
                     "delta"
                 ],
-                "title": "Chat completion response event."
+                "title": "An event during chat completion generation."
             },
             "ChatCompletionResponseEventType": {
                 "type": "string",
@@ -3209,19 +3248,22 @@
                     "start",
                     "complete",
                     "progress"
-                ]
+                ],
+                "title": "Types of events that can occur during chat completion."
             },
             "ChatCompletionResponseStreamChunk": {
                 "type": "object",
                 "properties": {
                     "event": {
-                        "$ref": "#/components/schemas/ChatCompletionResponseEvent"
+                        "$ref": "#/components/schemas/ChatCompletionResponseEvent",
+                        "description": "The event containing the new content"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "event"
-                ]
+                ],
+                "title": "A chunk of a streamed chat completion response."
             },
             "ContentDelta": {
                 "oneOf": [
@@ -3324,7 +3366,7 @@
                 "properties": {
                     "model_id": {
                         "type": "string",
-                        "description": "The identifier of the model to use"
+                        "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
                     },
                     "content": {
                         "$ref": "#/components/schemas/InterleavedContent",
@@ -3347,7 +3389,8 @@
                         "properties": {
                             "top_k": {
                                 "type": "integer",
-                                "default": 0
+                                "default": 0,
+                                "description": "How many tokens (for each position) to return log probabilities for."
                             }
                         },
                         "additionalProperties": false,
@@ -3364,23 +3407,26 @@
                 "type": "object",
                 "properties": {
                     "delta": {
-                        "type": "string"
+                        "type": "string",
+                        "description": "New content generated since last chunk. This can be one or more tokens."
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason"
+                        "$ref": "#/components/schemas/StopReason",
+                        "description": "Optional reason why generation stopped, if complete"
                     },
                     "logprobs": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/TokenLogProbs"
-                        }
+                        },
+                        "description": "Optional log probabilities for generated tokens"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "delta"
                 ],
-                "title": "streamed completion response."
+                "title": "A chunk of a streamed completion response."
             },
             "AgentConfig": {
                 "type": "object",
@@ -4264,14 +4310,14 @@
                 "properties": {
                     "model_id": {
                         "type": "string",
-                        "description": "The identifier of the model to use"
+                        "description": "The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint."
                     },
                     "contents": {
                         "type": "array",
                         "items": {
                             "$ref": "#/components/schemas/InterleavedContent"
                         },
-                        "description": "List of contents to generate embeddings for. Note that content can be multimodal."
+                        "description": "List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text."
                     }
                 },
                 "additionalProperties": false,
@@ -4290,13 +4336,15 @@
                             "items": {
                                 "type": "number"
                             }
-                        }
+                        },
+                        "description": "List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}"
                     }
                 },
                 "additionalProperties": false,
                 "required": [
                     "embeddings"
-                ]
+                ],
+                "title": "Response containing generated embeddings."
             },
             "AgentCandidate": {
                 "type": "object",
@@ -7887,19 +7935,19 @@
         },
         {
             "name": "ChatCompletionResponse",
-            "description": ""
+            "description": "Response from a chat completion request."
         },
         {
             "name": "ChatCompletionResponseEvent",
-            "description": "Chat completion response event."
+            "description": "An event during chat completion generation."
         },
         {
             "name": "ChatCompletionResponseEventType",
-            "description": ""
+            "description": "Types of events that can occur during chat completion."
         },
         {
             "name": "ChatCompletionResponseStreamChunk",
-            "description": ""
+            "description": "A chunk of a streamed chat completion response."
         },
         {
             "name": "Checkpoint",
@@ -7911,7 +7959,7 @@
         },
         {
             "name": "CompletionMessage",
-            "description": ""
+            "description": "A message containing the model's (assistant) response in a chat conversation."
         },
         {
             "name": "CompletionRequest",
@@ -7919,11 +7967,11 @@
         },
         {
             "name": "CompletionResponse",
-            "description": "Completion response."
+            "description": "Response from a completion request."
         },
         {
             "name": "CompletionResponseStreamChunk",
-            "description": "streamed completion response."
+            "description": "A chunk of a streamed completion response."
         },
         {
             "name": "ContentDelta",
@@ -7977,7 +8025,7 @@
         },
         {
             "name": "EmbeddingsResponse",
-            "description": ""
+            "description": "Response containing generated embeddings."
         },
         {
             "name": "Eval"
@@ -8011,7 +8059,7 @@
         },
         {
             "name": "GrammarResponseFormat",
-            "description": ""
+            "description": "Configuration for grammar-guided response generation."
         },
         {
             "name": "GreedySamplingStrategy",
@@ -8069,7 +8117,7 @@
         },
         {
             "name": "JsonSchemaResponseFormat",
-            "description": ""
+            "description": "Configuration for JSON schema-guided response generation."
         },
         {
             "name": "JsonType",
@@ -8434,7 +8482,7 @@
         },
         {
             "name": "SystemMessage",
-            "description": ""
+            "description": "A system message providing instructions or context to the model."
         },
         {
             "name": "Telemetry"
@@ -8449,7 +8497,7 @@
         },
         {
             "name": "TokenLogProbs",
-            "description": ""
+            "description": "Log probabilities for generated tokens."
         },
         {
             "name": "Tool",
@@ -8469,7 +8517,7 @@
         },
         {
             "name": "ToolChoice",
-            "description": ""
+            "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
         },
         {
             "name": "ToolDef",
@@ -8516,7 +8564,7 @@
         },
         {
             "name": "ToolResponseMessage",
-            "description": ""
+            "description": "A message representing the result of a tool invocation."
         },
         {
             "name": "ToolRuntime"
@@ -8555,7 +8603,7 @@
         },
         {
             "name": "UserMessage",
-            "description": ""
+            "description": "A message from the user in a chat conversation."
         },
         {
             "name": "VectorDB",
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index 353d99d00a..efe3882fb7 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -291,7 +291,8 @@ paths:
         '200':
           description: >-
             An array of embeddings, one for each content. Each embedding is a list
-            of floats.
+            of floats. The dimensionality of the embedding is model-specific; you
+            can check model metadata using /models/{model_id}
           content:
             application/json:
               schema:
@@ -1396,20 +1397,34 @@ components:
           type: string
           const: assistant
           default: assistant
+          description: >-
+            Must be "assistant" to identify this as the model's response
         content:
           $ref: '#/components/schemas/InterleavedContent'
+          description: The content of the model's response
         stop_reason:
           $ref: '#/components/schemas/StopReason'
+          description: >-
+            Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
+            The model finished generating the entire response. - `StopReason.end_of_message`:
+            The model finished generating but generated a partial response -- usually,
+            a tool call. The user may call the tool and continue the conversation
+            with the tool's response. - `StopReason.out_of_tokens`: The model ran
+            out of token budget.
         tool_calls:
           type: array
           items:
             $ref: '#/components/schemas/ToolCall'
+          description: >-
+            List of tool calls. Each tool call is a ToolCall object.
       additionalProperties: false
       required:
         - role
         - content
         - stop_reason
         - tool_calls
+      title: >-
+        A message containing the model's (assistant) response in a chat conversation.
     GrammarResponseFormat:
       type: object
       properties:
@@ -1417,6 +1432,8 @@ components:
           type: string
           const: grammar
           default: grammar
+          description: >-
+            Must be "grammar" to identify this format type
         bnf:
           type: object
           additionalProperties:
@@ -1427,10 +1444,14 @@ components:
               - type: string
               - type: array
               - type: object
+          description: >-
+            The BNF grammar specification the response should conform to
       additionalProperties: false
       required:
         - type
         - bnf
+      title: >-
+        Configuration for grammar-guided response generation.
     GreedySamplingStrategy:
       type: object
       properties:
@@ -1484,6 +1505,8 @@ components:
           type: string
           const: json_schema
           default: json_schema
+          description: >-
+            Must be "json_schema" to identify this format type
         json_schema:
           type: object
           additionalProperties:
@@ -1494,10 +1517,15 @@ components:
               - type: string
               - type: array
               - type: object
+          description: >-
+            The JSON schema the response should conform to. In a Python SDK, this
+            is often a `pydantic` model.
       additionalProperties: false
       required:
         - type
         - json_schema
+      title: >-
+        Configuration for JSON schema-guided response generation.
     Message:
       oneOf:
         - $ref: '#/components/schemas/UserMessage'
@@ -1556,12 +1584,20 @@ components:
           type: string
           const: system
           default: system
+          description: >-
+            Must be "system" to identify this as a system message
         content:
           $ref: '#/components/schemas/InterleavedContent'
+          description: >-
+            The content of the "system prompt". If multiple system messages are provided,
+            they are concatenated. The underlying Llama Stack code may also add other
+            system messages (for example, for formatting tool definitions).
       additionalProperties: false
       required:
         - role
         - content
+      title: >-
+        A system message providing instructions or context to the model.
     TextContentItem:
       type: object
       properties:
@@ -1619,6 +1655,10 @@ components:
       enum:
         - auto
         - required
+      title: >-
+        Whether tool use is required or automatic. This is a hint to the model which
+        may not be followed. It depends on the Instruction Following capabilities
+        of the model.
     ToolDefinition:
       type: object
       properties:
@@ -1691,20 +1731,28 @@ components:
           type: string
           const: tool
           default: tool
+          description: >-
+            Must be "tool" to identify this as a tool response
         call_id:
           type: string
+          description: >-
+            Unique identifier for the tool call this response is for
         tool_name:
           oneOf:
             - $ref: '#/components/schemas/BuiltinTool'
             - type: string
+          description: Name of the tool that was called
         content:
           $ref: '#/components/schemas/InterleavedContent'
+          description: The response content from the tool
       additionalProperties: false
       required:
         - role
         - call_id
         - tool_name
         - content
+      title: >-
+        A message representing the result of a tool invocation.
     TopKSamplingStrategy:
       type: object
       properties:
@@ -1748,14 +1796,23 @@ components:
           type: string
           const: user
           default: user
+          description: >-
+            Must be "user" to identify this as a user message
         content:
           $ref: '#/components/schemas/InterleavedContent'
+          description: >-
+            The content of the message, which can include text and other media
         context:
           $ref: '#/components/schemas/InterleavedContent'
+          description: >-
+            (Optional) This field is used internally by Llama Stack to pass RAG context.
+            This field may be removed in the API in the future.
       additionalProperties: false
       required:
         - role
         - content
+      title: >-
+        A message from the user in a chat conversation.
     BatchChatCompletionRequest:
       type: object
       properties:
@@ -1785,6 +1842,8 @@ components:
             top_k:
               type: integer
               default: 0
+              description: >-
+                How many tokens (for each position) to return log probabilities for.
           additionalProperties: false
       additionalProperties: false
       required:
@@ -1805,13 +1864,17 @@ components:
       properties:
         completion_message:
           $ref: '#/components/schemas/CompletionMessage'
+          description: The complete response message
         logprobs:
           type: array
           items:
             $ref: '#/components/schemas/TokenLogProbs'
+          description: >-
+            Optional log probabilities for generated tokens
       additionalProperties: false
       required:
         - completion_message
+      title: Response from a chat completion request.
     TokenLogProbs:
       type: object
       properties:
@@ -1819,9 +1882,12 @@ components:
           type: object
           additionalProperties:
             type: number
+          description: >-
+            Dictionary mapping tokens to their log probabilities
       additionalProperties: false
       required:
         - logprobs_by_token
+      title: Log probabilities for generated tokens.
     BatchCompletionRequest:
       type: object
       properties:
@@ -1841,6 +1907,8 @@ components:
             top_k:
               type: integer
               default: 0
+              description: >-
+                How many tokens (for each position) to return log probabilities for.
           additionalProperties: false
       additionalProperties: false
       required:
@@ -1861,17 +1929,21 @@ components:
       properties:
         content:
           type: string
+          description: The generated completion text
         stop_reason:
           $ref: '#/components/schemas/StopReason'
+          description: Reason why generation stopped
         logprobs:
           type: array
           items:
             $ref: '#/components/schemas/TokenLogProbs'
+          description: >-
+            Optional log probabilities for generated tokens
       additionalProperties: false
       required:
         - content
         - stop_reason
-      title: Completion response.
+      title: Response from a completion request.
     CancelTrainingJobRequest:
       type: object
       properties:
@@ -1885,7 +1957,9 @@ components:
       properties:
         model_id:
           type: string
-          description: The identifier of the model to use
+          description: >-
+            The identifier of the model to use. The model must be registered with
+            Llama Stack and available via the /models endpoint.
         messages:
           type: array
           items:
@@ -1908,12 +1982,20 @@ components:
         tool_prompt_format:
           $ref: '#/components/schemas/ToolPromptFormat'
           description: >-
-            (Optional) Specifies how tool definitions are formatted when presenting
-            to the model
+            (Optional) Instructs the model how to format tool calls. By default, Llama
+            Stack will attempt to use a format that is best adapted to the model.
+            - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+            - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name>
+            tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python
+            syntax -- a list of function calls.
         response_format:
           $ref: '#/components/schemas/ResponseFormat'
           description: >-
-            (Optional) Grammar specification for guided (structured) decoding
+            (Optional) Grammar specification for guided (structured) decoding. There
+            are two options: - `ResponseFormat.json_schema`: The grammar is a JSON
+            schema. Most providers support this format. - `ResponseFormat.grammar`:
+            The grammar is a BNF grammar. This format is more flexible, but not all
+            providers support it.
         stream:
           type: boolean
           description: >-
@@ -1925,6 +2007,8 @@ components:
             top_k:
               type: integer
               default: 0
+              description: >-
+                How many tokens (for each position) to return log probabilities for.
           additionalProperties: false
           description: >-
             (Optional) If specified, log probabilities for each token position will
@@ -1938,33 +2022,47 @@ components:
       properties:
         event_type:
           $ref: '#/components/schemas/ChatCompletionResponseEventType'
+          description: Type of the event
         delta:
           $ref: '#/components/schemas/ContentDelta'
+          description: >-
+            Content generated since last event. This can be one or more tokens, or
+            a tool call.
         logprobs:
           type: array
           items:
             $ref: '#/components/schemas/TokenLogProbs'
+          description: >-
+            Optional log probabilities for generated tokens
         stop_reason:
           $ref: '#/components/schemas/StopReason'
+          description: >-
+            Optional reason why generation stopped, if complete
       additionalProperties: false
       required:
         - event_type
         - delta
-      title: Chat completion response event.
+      title: >-
+        An event during chat completion generation.
     ChatCompletionResponseEventType:
       type: string
       enum:
         - start
         - complete
         - progress
+      title: >-
+        Types of events that can occur during chat completion.
     ChatCompletionResponseStreamChunk:
       type: object
       properties:
         event:
           $ref: '#/components/schemas/ChatCompletionResponseEvent'
+          description: The event containing the new content
       additionalProperties: false
       required:
         - event
+      title: >-
+        A chunk of a streamed chat completion response.
     ContentDelta:
       oneOf:
         - $ref: '#/components/schemas/TextDelta'
@@ -2033,7 +2131,9 @@ components:
       properties:
         model_id:
           type: string
-          description: The identifier of the model to use
+          description: >-
+            The identifier of the model to use. The model must be registered with
+            Llama Stack and available via the /models endpoint.
         content:
           $ref: '#/components/schemas/InterleavedContent'
           description: The content to generate a completion for
@@ -2056,6 +2156,8 @@ components:
             top_k:
               type: integer
               default: 0
+              description: >-
+                How many tokens (for each position) to return log probabilities for.
           additionalProperties: false
           description: >-
             (Optional) If specified, log probabilities for each token position will
@@ -2069,16 +2171,23 @@ components:
       properties:
         delta:
           type: string
+          description: >-
+            New content generated since last chunk. This can be one or more tokens.
         stop_reason:
           $ref: '#/components/schemas/StopReason'
+          description: >-
+            Optional reason why generation stopped, if complete
         logprobs:
           type: array
           items:
             $ref: '#/components/schemas/TokenLogProbs'
+          description: >-
+            Optional log probabilities for generated tokens
       additionalProperties: false
       required:
         - delta
-      title: streamed completion response.
+      title: >-
+        A chunk of a streamed completion response.
     AgentConfig:
       type: object
       properties:
@@ -2633,14 +2742,17 @@ components:
       properties:
         model_id:
           type: string
-          description: The identifier of the model to use
+          description: >-
+            The identifier of the model to use. The model must be an embedding model
+            registered with Llama Stack and available via the /models endpoint.
         contents:
           type: array
           items:
             $ref: '#/components/schemas/InterleavedContent'
           description: >-
             List of contents to generate embeddings for. Note that content can be
-            multimodal.
+            multimodal. The behavior depends on the model and provider. Some models
+            may only support text.
       additionalProperties: false
       required:
         - model_id
@@ -2654,9 +2766,15 @@ components:
             type: array
             items:
               type: number
+          description: >-
+            List of embedding vectors, one per input content. Each embedding is a
+            list of floats. The dimensionality of the embedding is model-specific;
+            you can check model metadata using /models/{model_id}
       additionalProperties: false
       required:
         - embeddings
+      title: >-
+        Response containing generated embeddings.
     AgentCandidate:
       type: object
       properties:
@@ -4833,25 +4951,30 @@ tags:
   - name: ChatCompletionRequest
     description: ''
   - name: ChatCompletionResponse
-    description: ''
+    description: Response from a chat completion request.
   - name: ChatCompletionResponseEvent
-    description: Chat completion response event.
+    description: >-
+      An event during chat completion generation.
   - name: ChatCompletionResponseEventType
-    description: ''
+    description: >-
+      Types of events that can occur during chat completion.
   - name: ChatCompletionResponseStreamChunk
-    description: ''
+    description: >-
+      A chunk of a streamed chat completion response.
   - name: Checkpoint
     description: Checkpoint created during training runs
   - name: CompletionInputType
     description: ''
   - name: CompletionMessage
-    description: ''
+    description: >-
+      A message containing the model's (assistant) response in a chat conversation.
   - name: CompletionRequest
     description: ''
   - name: CompletionResponse
-    description: Completion response.
+    description: Response from a completion request.
   - name: CompletionResponseStreamChunk
-    description: streamed completion response.
+    description: >-
+      A chunk of a streamed completion response.
   - name: ContentDelta
     description: ''
   - name: CreateAgentRequest
@@ -4877,7 +5000,8 @@ tags:
   - name: EmbeddingsRequest
     description: ''
   - name: EmbeddingsResponse
-    description: ''
+    description: >-
+      Response containing generated embeddings.
   - name: Eval
   - name: EvalCandidate
     description: ''
@@ -4893,7 +5017,8 @@ tags:
   - name: Event
     description: ''
   - name: GrammarResponseFormat
-    description: ''
+    description: >-
+      Configuration for grammar-guided response generation.
   - name: GreedySamplingStrategy
     description: ''
   - name: HealthInfo
@@ -4921,7 +5046,8 @@ tags:
   - name: JobStatus
     description: ''
   - name: JsonSchemaResponseFormat
-    description: ''
+    description: >-
+      Configuration for JSON schema-guided response generation.
   - name: JsonType
     description: ''
   - name: LLMAsJudgeScoringFnParams
@@ -5104,14 +5230,15 @@ tags:
       Response from the synthetic data generation. Batch of (prompt, response, score)
       tuples that pass the threshold.
   - name: SystemMessage
-    description: ''
+    description: >-
+      A system message providing instructions or context to the model.
   - name: Telemetry
   - name: TextContentItem
     description: ''
   - name: TextDelta
     description: ''
   - name: TokenLogProbs
-    description: ''
+    description: Log probabilities for generated tokens.
   - name: Tool
     description: ''
   - name: ToolCall
@@ -5121,7 +5248,10 @@ tags:
   - name: ToolCallParseStatus
     description: ''
   - name: ToolChoice
-    description: ''
+    description: >-
+      Whether tool use is required or automatic. This is a hint to the model which
+      may not be followed. It depends on the Instruction Following capabilities of
+      the model.
   - name: ToolDef
     description: ''
   - name: ToolDefinition
@@ -5166,7 +5296,8 @@ tags:
   - name: ToolResponse
     description: ''
   - name: ToolResponseMessage
-    description: ''
+    description: >-
+      A message representing the result of a tool invocation.
   - name: ToolRuntime
   - name: TopKSamplingStrategy
     description: ''
@@ -5186,7 +5317,8 @@ tags:
   - name: UnstructuredLogEvent
     description: ''
   - name: UserMessage
-    description: ''
+    description: >-
+      A message from the user in a chat conversation.
   - name: VectorDB
     description: ''
   - name: VectorDBs
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 36f385eb27..4541761755 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -35,11 +35,23 @@
 
 
 class LogProbConfig(BaseModel):
+    """
+
+    :param top_k: How many tokens (for each position) to return log probabilities for.
+    """
+
     top_k: Optional[int] = 0
 
 
 @json_schema_type
 class QuantizationType(Enum):
+    """Type of model quantization to run inference with.
+
+    :cvar bf16: BFloat16; typically this means _no_ quantization
+    :cvar fp8: 8-bit floating point quantization
+    :cvar int4: 4-bit integer quantization
+    """
+
     bf16 = "bf16"
     fp8 = "fp8"
     int4 = "int4"
@@ -57,6 +69,12 @@ class Bf16QuantizationConfig(BaseModel):
 
 @json_schema_type
 class Int4QuantizationConfig(BaseModel):
+    """Configuration for 4-bit integer quantization.
+
+    :param type: Must be "int4" to identify this quantization type
+    :param scheme: Quantization scheme to use. Defaults to "int4_weight_int8_dynamic_activation"
+    """
+
     type: Literal["int4"] = "int4"
     scheme: Optional[str] = "int4_weight_int8_dynamic_activation"
 
@@ -69,6 +87,13 @@ class Int4QuantizationConfig(BaseModel):
 
 @json_schema_type
 class UserMessage(BaseModel):
+    """A message from the user in a chat conversation.
+
+    :param role: Must be "user" to identify this as a user message
+    :param content: The content of the message, which can include text and other media
+    :param context: (Optional) This field is used internally by Llama Stack to pass RAG context. This field may be removed in the API in the future.
+    """
+
     role: Literal["user"] = "user"
     content: InterleavedContent
     context: Optional[InterleavedContent] = None
@@ -76,15 +101,27 @@ class UserMessage(BaseModel):
 
 @json_schema_type
 class SystemMessage(BaseModel):
+    """A system message providing instructions or context to the model.
+
+    :param role: Must be "system" to identify this as a system message
+    :param content: The content of the "system prompt". If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages (for example, for formatting tool definitions).
+    """
+
     role: Literal["system"] = "system"
     content: InterleavedContent
 
 
 @json_schema_type
 class ToolResponseMessage(BaseModel):
+    """A message representing the result of a tool invocation.
+
+    :param role: Must be "tool" to identify this as a tool response
+    :param call_id: Unique identifier for the tool call this response is for
+    :param tool_name: Name of the tool that was called
+    :param content: The response content from the tool
+    """
+
     role: Literal["tool"] = "tool"
-    # it was nice to re-use the ToolResponse type, but having all messages
-    # have a `content` type makes things nicer too
     call_id: str
     tool_name: Union[BuiltinTool, str]
     content: InterleavedContent
@@ -92,6 +129,17 @@ class ToolResponseMessage(BaseModel):
 
 @json_schema_type
 class CompletionMessage(BaseModel):
+    """A message containing the model's (assistant) response in a chat conversation.
+
+    :param role: Must be "assistant" to identify this as the model's response
+    :param content: The content of the model's response
+    :param stop_reason: Reason why the model stopped generating. Options are:
+        - `StopReason.end_of_turn`: The model finished generating the entire response.
+        - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response.
+        - `StopReason.out_of_tokens`: The model ran out of token budget.
+    :param tool_calls: List of tool calls. Each tool call is a ToolCall object.
+    """
+
     role: Literal["assistant"] = "assistant"
     content: InterleavedContent
     stop_reason: StopReason
@@ -131,17 +179,35 @@ def validate_field(cls, v):
 
 @json_schema_type
 class ToolChoice(Enum):
+    """Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
+
+    :cvar auto: The model may use tools if it determines that is appropriate.
+    :cvar required: The model must use tools.
+    """
+
     auto = "auto"
     required = "required"
 
 
 @json_schema_type
 class TokenLogProbs(BaseModel):
+    """Log probabilities for generated tokens.
+
+    :param logprobs_by_token: Dictionary mapping tokens to their log probabilities
+    """
+
     logprobs_by_token: Dict[str, float]
 
 
 @json_schema_type
 class ChatCompletionResponseEventType(Enum):
+    """Types of events that can occur during chat completion.
+
+    :cvar start: Inference has started
+    :cvar complete: Inference is complete and a full response is available
+    :cvar progress: Inference is in progress and a partial response is available
+    """
+
     start = "start"
     complete = "complete"
     progress = "progress"
@@ -149,7 +215,13 @@ class ChatCompletionResponseEventType(Enum):
 
 @json_schema_type
 class ChatCompletionResponseEvent(BaseModel):
-    """Chat completion response event."""
+    """An event during chat completion generation.
+
+    :param event_type: Type of the event
+    :param delta: Content generated since last event. This can be one or more tokens, or a tool call.
+    :param logprobs: Optional log probabilities for generated tokens
+    :param stop_reason: Optional reason why generation stopped, if complete
+    """
 
     event_type: ChatCompletionResponseEventType
     delta: ContentDelta
@@ -159,12 +231,24 @@ class ChatCompletionResponseEvent(BaseModel):
 
 @json_schema_type
 class ResponseFormatType(Enum):
+    """Types of formats for structured (guided) decoding.
+
+    :cvar json_schema: Response should conform to a JSON schema. In a Python SDK, this is often a `pydantic` model.
+    :cvar grammar: Response should conform to a BNF grammar
+    """
+
     json_schema = "json_schema"
     grammar = "grammar"
 
 
 @json_schema_type
 class JsonSchemaResponseFormat(BaseModel):
+    """Configuration for JSON schema-guided response generation.
+
+    :param type: Must be "json_schema" to identify this format type
+    :param json_schema: The JSON schema the response should conform to. In a Python SDK, this is often a `pydantic` model.
+    """
+
     type: Literal[ResponseFormatType.json_schema.value] = (
         ResponseFormatType.json_schema.value
     )
@@ -173,6 +257,12 @@ class JsonSchemaResponseFormat(BaseModel):
 
 @json_schema_type
 class GrammarResponseFormat(BaseModel):
+    """Configuration for grammar-guided response generation.
+
+    :param type: Must be "grammar" to identify this format type
+    :param bnf: The BNF grammar specification the response should conform to
+    """
+
     type: Literal[ResponseFormatType.grammar.value] = ResponseFormatType.grammar.value
     bnf: Dict[str, Any]
 
@@ -186,19 +276,24 @@ class GrammarResponseFormat(BaseModel):
 )
 
 
+# This is an internally used class
 class CompletionRequest(BaseModel):
     model: str
     content: InterleavedContent
     sampling_params: Optional[SamplingParams] = SamplingParams()
     response_format: Optional[ResponseFormat] = None
-
     stream: Optional[bool] = False
     logprobs: Optional[LogProbConfig] = None
 
 
 @json_schema_type
 class CompletionResponse(BaseModel):
-    """Completion response."""
+    """Response from a completion request.
+
+    :param content: The generated completion text
+    :param stop_reason: Reason why generation stopped
+    :param logprobs: Optional log probabilities for generated tokens
+    """
 
     content: str
     stop_reason: StopReason
@@ -207,41 +302,60 @@ class CompletionResponse(BaseModel):
 
 @json_schema_type
 class CompletionResponseStreamChunk(BaseModel):
-    """streamed completion response."""
+    """A chunk of a streamed completion response.
+
+    :param delta: New content generated since last chunk. This can be one or more tokens.
+    :param stop_reason: Optional reason why generation stopped, if complete
+    :param logprobs: Optional log probabilities for generated tokens
+    """
 
     delta: str
     stop_reason: Optional[StopReason] = None
     logprobs: Optional[List[TokenLogProbs]] = None
 
 
+# This is an internally used class
 class ChatCompletionRequest(BaseModel):
     model: str
     messages: List[Message]
     sampling_params: Optional[SamplingParams] = SamplingParams()
-
-    # zero-shot tool definitions as input to the model
     tools: Optional[List[ToolDefinition]] = Field(default_factory=list)
     tool_choice: Optional[ToolChoice] = Field(default=ToolChoice.auto)
     tool_prompt_format: Optional[ToolPromptFormat] = Field(default=None)
     response_format: Optional[ResponseFormat] = None
-
     stream: Optional[bool] = False
     logprobs: Optional[LogProbConfig] = None
 
 
 @json_schema_type
 class ChatCompletionResponseStreamChunk(BaseModel):
+    """A chunk of a streamed chat completion response.
+
+    :param event: The event containing the new content
+    """
+
     event: ChatCompletionResponseEvent
 
 
 @json_schema_type
 class ChatCompletionResponse(BaseModel):
+    """Response from a chat completion request.
+
+    :param completion_message: The complete response message
+    :param logprobs: Optional log probabilities for generated tokens
+    """
+
     completion_message: CompletionMessage
     logprobs: Optional[List[TokenLogProbs]] = None
 
 
 @json_schema_type
 class EmbeddingsResponse(BaseModel):
+    """Response containing generated embeddings.
+
+    :param embeddings: List of embedding vectors, one per input content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
+    """
+
     embeddings: List[List[float]]
 
 
@@ -266,7 +380,7 @@ async def completion(
     ) -> Union[CompletionResponse, AsyncIterator[CompletionResponseStreamChunk]]:
         """Generate a completion for the given content using the specified model.
 
-        :param model_id: The identifier of the model to use
+        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
         :param content: The content to generate a completion for
         :param sampling_params: (Optional) Parameters to control the sampling strategy
         :param response_format: (Optional) Grammar specification for guided (structured) decoding
@@ -294,13 +408,18 @@ async def chat_completion(
     ]:
         """Generate a chat completion for the given messages using the specified model.
 
-        :param model_id: The identifier of the model to use
+        :param model_id: The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint.
         :param messages: List of messages in the conversation
         :param sampling_params: Parameters to control the sampling strategy
         :param tools: (Optional) List of tool definitions available to the model
         :param tool_choice: (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
-        :param tool_prompt_format: (Optional) Specifies how tool definitions are formatted when presenting to the model
-        :param response_format: (Optional) Grammar specification for guided (structured) decoding
+        :param tool_prompt_format: (Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model.
+            - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object.
+            - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag.
+            - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls.
+        :param response_format: (Optional) Grammar specification for guided (structured) decoding. There are two options:
+            - `ResponseFormat.json_schema`: The grammar is a JSON schema. Most providers support this format.
+            - `ResponseFormat.grammar`: The grammar is a BNF grammar. This format is more flexible, but not all providers support it.
         :param stream: (Optional) If True, generate an SSE event stream of the response. Defaults to False.
         :param logprobs: (Optional) If specified, log probabilities for each token position will be returned.
         :returns: If stream=False, returns a ChatCompletionResponse with the full completion.
@@ -316,8 +435,8 @@ async def embeddings(
     ) -> EmbeddingsResponse:
         """Generate embeddings for content pieces using the specified model.
 
-        :param model_id: The identifier of the model to use
-        :param contents: List of contents to generate embeddings for. Note that content can be multimodal.
-        :returns: An array of embeddings, one for each content. Each embedding is a list of floats.
+        :param model_id: The identifier of the model to use. The model must be an embedding model registered with Llama Stack and available via the /models endpoint.
+        :param contents: List of contents to generate embeddings for. Note that content can be multimodal. The behavior depends on the model and provider. Some models may only support text.
+        :returns: An array of embeddings, one for each content. Each embedding is a list of floats. The dimensionality of the embedding is model-specific; you can check model metadata using /models/{model_id}
         """
         ...

From 3e2a751f542898ba226adfc8cb308cbd62d5851e Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Date: Wed, 29 Jan 2025 09:09:51 -0800
Subject: [PATCH 3/5] Flatten enums

---
 docs/resources/llama-stack-spec.html     | 247 +++++++++++-----------
 docs/resources/llama-stack-spec.yaml     | 248 +++++++++++------------
 llama_stack/apis/common/content_types.py |   1 -
 llama_stack/apis/inference/inference.py  |   4 -
 llama_stack/apis/resource.py             |   2 -
 5 files changed, 243 insertions(+), 259 deletions(-)

diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index 58fa770104..15c9c9484c 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -2337,15 +2337,6 @@
                     "rows"
                 ]
             },
-            "BuiltinTool": {
-                "type": "string",
-                "enum": [
-                    "brave_search",
-                    "wolfram_alpha",
-                    "photogen",
-                    "code_interpreter"
-                ]
-            },
             "CompletionMessage": {
                 "type": "object",
                 "properties": {
@@ -2360,7 +2351,12 @@
                         "description": "The content of the model's response"
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason",
+                        "type": "string",
+                        "enum": [
+                            "end_of_turn",
+                            "end_of_message",
+                            "out_of_tokens"
+                        ],
                         "description": "Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`: The model finished generating the entire response. - `StopReason.end_of_message`: The model finished generating but generated a partial response -- usually, a tool call. The user may call the tool and continue the conversation with the tool's response. - `StopReason.out_of_tokens`: The model ran out of token budget."
                     },
                     "tool_calls": {
@@ -2587,25 +2583,7 @@
                 "type": "object",
                 "properties": {
                     "strategy": {
-                        "oneOf": [
-                            {
-                                "$ref": "#/components/schemas/GreedySamplingStrategy"
-                            },
-                            {
-                                "$ref": "#/components/schemas/TopPSamplingStrategy"
-                            },
-                            {
-                                "$ref": "#/components/schemas/TopKSamplingStrategy"
-                            }
-                        ],
-                        "discriminator": {
-                            "propertyName": "type",
-                            "mapping": {
-                                "greedy": "#/components/schemas/GreedySamplingStrategy",
-                                "top_p": "#/components/schemas/TopPSamplingStrategy",
-                                "top_k": "#/components/schemas/TopKSamplingStrategy"
-                            }
-                        }
+                        "$ref": "#/components/schemas/SamplingStrategy"
                     },
                     "max_tokens": {
                         "type": "integer",
@@ -2621,13 +2599,26 @@
                     "strategy"
                 ]
             },
-            "StopReason": {
-                "type": "string",
-                "enum": [
-                    "end_of_turn",
-                    "end_of_message",
-                    "out_of_tokens"
-                ]
+            "SamplingStrategy": {
+                "oneOf": [
+                    {
+                        "$ref": "#/components/schemas/GreedySamplingStrategy"
+                    },
+                    {
+                        "$ref": "#/components/schemas/TopPSamplingStrategy"
+                    },
+                    {
+                        "$ref": "#/components/schemas/TopKSamplingStrategy"
+                    }
+                ],
+                "discriminator": {
+                    "propertyName": "type",
+                    "mapping": {
+                        "greedy": "#/components/schemas/GreedySamplingStrategy",
+                        "top_p": "#/components/schemas/TopPSamplingStrategy",
+                        "top_k": "#/components/schemas/TopKSamplingStrategy"
+                    }
+                }
             },
             "SystemMessage": {
                 "type": "object",
@@ -2677,7 +2668,13 @@
                     "tool_name": {
                         "oneOf": [
                             {
-                                "$ref": "#/components/schemas/BuiltinTool"
+                                "type": "string",
+                                "enum": [
+                                    "brave_search",
+                                    "wolfram_alpha",
+                                    "photogen",
+                                    "code_interpreter"
+                                ]
                             },
                             {
                                 "type": "string"
@@ -2758,21 +2755,19 @@
                     "arguments"
                 ]
             },
-            "ToolChoice": {
-                "type": "string",
-                "enum": [
-                    "auto",
-                    "required"
-                ],
-                "title": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
-            },
             "ToolDefinition": {
                 "type": "object",
                 "properties": {
                     "tool_name": {
                         "oneOf": [
                             {
-                                "$ref": "#/components/schemas/BuiltinTool"
+                                "type": "string",
+                                "enum": [
+                                    "brave_search",
+                                    "wolfram_alpha",
+                                    "photogen",
+                                    "code_interpreter"
+                                ]
                             },
                             {
                                 "type": "string"
@@ -2835,16 +2830,6 @@
                     "param_type"
                 ]
             },
-            "ToolPromptFormat": {
-                "type": "string",
-                "enum": [
-                    "json",
-                    "function_tag",
-                    "python_list"
-                ],
-                "title": "This Enum refers to the prompt format for calling custom / zero shot tools",
-                "description": "`json` --\n    Refers to the json format for calling tools.\n    The json format takes the form like\n    {\n        \"type\": \"function\",\n        \"function\" : {\n            \"name\": \"function_name\",\n            \"description\": \"function_description\",\n            \"parameters\": {...}\n        }\n    }\n\n`function_tag` --\n    This is an example of how you could define\n    your own user defined format for making tool calls.\n    The function_tag format looks like this,\n    <function=function_name>(parameters)</function>\n\nThe detailed prompts for each of these formats are added to llama cli"
-            },
             "ToolResponseMessage": {
                 "type": "object",
                 "properties": {
@@ -2861,7 +2846,13 @@
                     "tool_name": {
                         "oneOf": [
                             {
-                                "$ref": "#/components/schemas/BuiltinTool"
+                                "type": "string",
+                                "enum": [
+                                    "brave_search",
+                                    "wolfram_alpha",
+                                    "photogen",
+                                    "code_interpreter"
+                                ]
                             },
                             {
                                 "type": "string"
@@ -2984,10 +2975,22 @@
                         }
                     },
                     "tool_choice": {
-                        "$ref": "#/components/schemas/ToolChoice"
+                        "type": "string",
+                        "enum": [
+                            "auto",
+                            "required"
+                        ],
+                        "title": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
                     },
                     "tool_prompt_format": {
-                        "$ref": "#/components/schemas/ToolPromptFormat"
+                        "type": "string",
+                        "enum": [
+                            "json",
+                            "function_tag",
+                            "python_list"
+                        ],
+                        "title": "This Enum refers to the prompt format for calling custom / zero shot tools",
+                        "description": "The detailed prompts for each of these formats are added to llama cli"
                     },
                     "response_format": {
                         "$ref": "#/components/schemas/ResponseFormat"
@@ -3122,7 +3125,12 @@
                         "description": "The generated completion text"
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason",
+                        "type": "string",
+                        "enum": [
+                            "end_of_turn",
+                            "end_of_message",
+                            "out_of_tokens"
+                        ],
                         "description": "Reason why generation stopped"
                     },
                     "logprobs": {
@@ -3178,11 +3186,20 @@
                         "description": "(Optional) List of tool definitions available to the model"
                     },
                     "tool_choice": {
-                        "$ref": "#/components/schemas/ToolChoice",
+                        "type": "string",
+                        "enum": [
+                            "auto",
+                            "required"
+                        ],
                         "description": "(Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto."
                     },
                     "tool_prompt_format": {
-                        "$ref": "#/components/schemas/ToolPromptFormat",
+                        "type": "string",
+                        "enum": [
+                            "json",
+                            "function_tag",
+                            "python_list"
+                        ],
                         "description": "(Optional) Instructs the model how to format tool calls. By default, Llama Stack will attempt to use a format that is best adapted to the model. - `ToolPromptFormat.json`: The tool calls are formatted as a JSON object. - `ToolPromptFormat.function_tag`: The tool calls are enclosed in a <function=function_name> tag. - `ToolPromptFormat.python_list`: The tool calls are output as Python syntax -- a list of function calls."
                     },
                     "response_format": {
@@ -3216,7 +3233,12 @@
                 "type": "object",
                 "properties": {
                     "event_type": {
-                        "$ref": "#/components/schemas/ChatCompletionResponseEventType",
+                        "type": "string",
+                        "enum": [
+                            "start",
+                            "complete",
+                            "progress"
+                        ],
                         "description": "Type of the event"
                     },
                     "delta": {
@@ -3231,7 +3253,12 @@
                         "description": "Optional log probabilities for generated tokens"
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason",
+                        "type": "string",
+                        "enum": [
+                            "end_of_turn",
+                            "end_of_message",
+                            "out_of_tokens"
+                        ],
                         "description": "Optional reason why generation stopped, if complete"
                     }
                 },
@@ -3242,15 +3269,6 @@
                 ],
                 "title": "An event during chat completion generation."
             },
-            "ChatCompletionResponseEventType": {
-                "type": "string",
-                "enum": [
-                    "start",
-                    "complete",
-                    "progress"
-                ],
-                "title": "Types of events that can occur during chat completion."
-            },
             "ChatCompletionResponseStreamChunk": {
                 "type": "object",
                 "properties": {
@@ -3342,7 +3360,13 @@
                         ]
                     },
                     "parse_status": {
-                        "$ref": "#/components/schemas/ToolCallParseStatus"
+                        "type": "string",
+                        "enum": [
+                            "started",
+                            "in_progress",
+                            "failed",
+                            "succeeded"
+                        ]
                     }
                 },
                 "additionalProperties": false,
@@ -3352,15 +3376,6 @@
                     "parse_status"
                 ]
             },
-            "ToolCallParseStatus": {
-                "type": "string",
-                "enum": [
-                    "started",
-                    "in_progress",
-                    "failed",
-                    "succeeded"
-                ]
-            },
             "CompletionRequest": {
                 "type": "object",
                 "properties": {
@@ -3411,7 +3426,12 @@
                         "description": "New content generated since last chunk. This can be one or more tokens."
                     },
                     "stop_reason": {
-                        "$ref": "#/components/schemas/StopReason",
+                        "type": "string",
+                        "enum": [
+                            "end_of_turn",
+                            "end_of_message",
+                            "out_of_tokens"
+                        ],
                         "description": "Optional reason why generation stopped, if complete"
                     },
                     "logprobs": {
@@ -3459,11 +3479,23 @@
                         }
                     },
                     "tool_choice": {
-                        "$ref": "#/components/schemas/ToolChoice",
+                        "type": "string",
+                        "enum": [
+                            "auto",
+                            "required"
+                        ],
+                        "title": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.",
                         "default": "auto"
                     },
                     "tool_prompt_format": {
-                        "$ref": "#/components/schemas/ToolPromptFormat"
+                        "type": "string",
+                        "enum": [
+                            "json",
+                            "function_tag",
+                            "python_list"
+                        ],
+                        "title": "This Enum refers to the prompt format for calling custom / zero shot tools",
+                        "description": "The detailed prompts for each of these formats are added to llama cli"
                     },
                     "max_infer_iters": {
                         "type": "integer",
@@ -4170,7 +4202,13 @@
                     "tool_name": {
                         "oneOf": [
                             {
-                                "$ref": "#/components/schemas/BuiltinTool"
+                                "type": "string",
+                                "enum": [
+                                    "brave_search",
+                                    "wolfram_alpha",
+                                    "photogen",
+                                    "code_interpreter"
+                                ]
                             },
                             {
                                 "type": "string"
@@ -7917,10 +7955,6 @@
             "name": "BooleanType",
             "description": ""
         },
-        {
-            "name": "BuiltinTool",
-            "description": ""
-        },
         {
             "name": "CancelTrainingJobRequest",
             "description": ""
@@ -7941,10 +7975,6 @@
             "name": "ChatCompletionResponseEvent",
             "description": "An event during chat completion generation."
         },
-        {
-            "name": "ChatCompletionResponseEventType",
-            "description": "Types of events that can occur during chat completion."
-        },
         {
             "name": "ChatCompletionResponseStreamChunk",
             "description": "A chunk of a streamed chat completion response."
@@ -8376,6 +8406,10 @@
             "name": "SamplingParams",
             "description": ""
         },
+        {
+            "name": "SamplingStrategy",
+            "description": ""
+        },
         {
             "name": "SaveSpansToDatasetRequest",
             "description": ""
@@ -8449,10 +8483,6 @@
             "name": "SpanWithStatus",
             "description": ""
         },
-        {
-            "name": "StopReason",
-            "description": ""
-        },
         {
             "name": "StringType",
             "description": ""
@@ -8511,14 +8541,6 @@
             "name": "ToolCallDelta",
             "description": ""
         },
-        {
-            "name": "ToolCallParseStatus",
-            "description": ""
-        },
-        {
-            "name": "ToolChoice",
-            "description": "Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model."
-        },
         {
             "name": "ToolDef",
             "description": ""
@@ -8554,10 +8576,6 @@
             "name": "ToolParameter",
             "description": ""
         },
-        {
-            "name": "ToolPromptFormat",
-            "description": "This Enum refers to the prompt format for calling custom / zero shot tools\n\n`json` --\n    Refers to the json format for calling tools.\n    The json format takes the form like\n    {\n        \"type\": \"function\",\n        \"function\" : {\n            \"name\": \"function_name\",\n            \"description\": \"function_description\",\n            \"parameters\": {...}\n        }\n    }\n\n`function_tag` --\n    This is an example of how you could define\n    your own user defined format for making tool calls.\n    The function_tag format looks like this,\n    <function=function_name>(parameters)</function>\n\nThe detailed prompts for each of these formats are added to llama cli"
-        },
         {
             "name": "ToolResponse",
             "description": ""
@@ -8680,13 +8698,11 @@
                 "BatchCompletionResponse",
                 "BenchmarkEvalTaskConfig",
                 "BooleanType",
-                "BuiltinTool",
                 "CancelTrainingJobRequest",
                 "ChatCompletionInputType",
                 "ChatCompletionRequest",
                 "ChatCompletionResponse",
                 "ChatCompletionResponseEvent",
-                "ChatCompletionResponseEventType",
                 "ChatCompletionResponseStreamChunk",
                 "Checkpoint",
                 "CompletionInputType",
@@ -8788,6 +8804,7 @@
                 "RunShieldResponse",
                 "SafetyViolation",
                 "SamplingParams",
+                "SamplingStrategy",
                 "SaveSpansToDatasetRequest",
                 "ScoreBatchRequest",
                 "ScoreBatchResponse",
@@ -8804,7 +8821,6 @@
                 "SpanStartPayload",
                 "SpanStatus",
                 "SpanWithStatus",
-                "StopReason",
                 "StringType",
                 "StructuredLogEvent",
                 "StructuredLogPayload",
@@ -8818,8 +8834,6 @@
                 "Tool",
                 "ToolCall",
                 "ToolCallDelta",
-                "ToolCallParseStatus",
-                "ToolChoice",
                 "ToolDef",
                 "ToolDefinition",
                 "ToolExecutionStep",
@@ -8828,7 +8842,6 @@
                 "ToolInvocationResult",
                 "ToolParamDefinition",
                 "ToolParameter",
-                "ToolPromptFormat",
                 "ToolResponse",
                 "ToolResponseMessage",
                 "TopKSamplingStrategy",
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index efe3882fb7..a83b3afe5d 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -1383,13 +1383,6 @@ components:
       required:
         - dataset_id
         - rows
-    BuiltinTool:
-      type: string
-      enum:
-        - brave_search
-        - wolfram_alpha
-        - photogen
-        - code_interpreter
     CompletionMessage:
       type: object
       properties:
@@ -1403,7 +1396,11 @@ components:
           $ref: '#/components/schemas/InterleavedContent'
           description: The content of the model's response
         stop_reason:
-          $ref: '#/components/schemas/StopReason'
+          type: string
+          enum:
+            - end_of_turn
+            - end_of_message
+            - out_of_tokens
           description: >-
             Reason why the model stopped generating. Options are: - `StopReason.end_of_turn`:
             The model finished generating the entire response. - `StopReason.end_of_message`:
@@ -1552,16 +1549,7 @@ components:
       type: object
       properties:
         strategy:
-          oneOf:
-            - $ref: '#/components/schemas/GreedySamplingStrategy'
-            - $ref: '#/components/schemas/TopPSamplingStrategy'
-            - $ref: '#/components/schemas/TopKSamplingStrategy'
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
+          $ref: '#/components/schemas/SamplingStrategy'
         max_tokens:
           type: integer
           default: 0
@@ -1571,12 +1559,17 @@ components:
       additionalProperties: false
       required:
         - strategy
-    StopReason:
-      type: string
-      enum:
-        - end_of_turn
-        - end_of_message
-        - out_of_tokens
+    SamplingStrategy:
+      oneOf:
+        - $ref: '#/components/schemas/GreedySamplingStrategy'
+        - $ref: '#/components/schemas/TopPSamplingStrategy'
+        - $ref: '#/components/schemas/TopKSamplingStrategy'
+      discriminator:
+        propertyName: type
+        mapping:
+          greedy: '#/components/schemas/GreedySamplingStrategy'
+          top_p: '#/components/schemas/TopPSamplingStrategy'
+          top_k: '#/components/schemas/TopKSamplingStrategy'
     SystemMessage:
       type: object
       properties:
@@ -1618,7 +1611,12 @@ components:
           type: string
         tool_name:
           oneOf:
-            - $ref: '#/components/schemas/BuiltinTool'
+            - type: string
+              enum:
+                - brave_search
+                - wolfram_alpha
+                - photogen
+                - code_interpreter
             - type: string
         arguments:
           type: object
@@ -1650,21 +1648,17 @@ components:
         - call_id
         - tool_name
         - arguments
-    ToolChoice:
-      type: string
-      enum:
-        - auto
-        - required
-      title: >-
-        Whether tool use is required or automatic. This is a hint to the model which
-        may not be followed. It depends on the Instruction Following capabilities
-        of the model.
     ToolDefinition:
       type: object
       properties:
         tool_name:
           oneOf:
-            - $ref: '#/components/schemas/BuiltinTool'
+            - type: string
+              enum:
+                - brave_search
+                - wolfram_alpha
+                - photogen
+                - code_interpreter
             - type: string
         description:
           type: string
@@ -1696,34 +1690,6 @@ components:
       additionalProperties: false
       required:
         - param_type
-    ToolPromptFormat:
-      type: string
-      enum:
-        - json
-        - function_tag
-        - python_list
-      title: >-
-        This Enum refers to the prompt format for calling custom / zero shot tools
-      description: >-
-        `json` --
-            Refers to the json format for calling tools.
-            The json format takes the form like
-            {
-                "type": "function",
-                "function" : {
-                    "name": "function_name",
-                    "description": "function_description",
-                    "parameters": {...}
-                }
-            }
-
-        `function_tag` --
-            This is an example of how you could define
-            your own user defined format for making tool calls.
-            The function_tag format looks like this,
-            <function=function_name>(parameters)</function>
-
-        The detailed prompts for each of these formats are added to llama cli
     ToolResponseMessage:
       type: object
       properties:
@@ -1739,7 +1705,12 @@ components:
             Unique identifier for the tool call this response is for
         tool_name:
           oneOf:
-            - $ref: '#/components/schemas/BuiltinTool'
+            - type: string
+              enum:
+                - brave_search
+                - wolfram_alpha
+                - photogen
+                - code_interpreter
             - type: string
           description: Name of the tool that was called
         content:
@@ -1831,9 +1802,24 @@ components:
           items:
             $ref: '#/components/schemas/ToolDefinition'
         tool_choice:
-          $ref: '#/components/schemas/ToolChoice'
+          type: string
+          enum:
+            - auto
+            - required
+          title: >-
+            Whether tool use is required or automatic. This is a hint to the model
+            which may not be followed. It depends on the Instruction Following capabilities
+            of the model.
         tool_prompt_format:
-          $ref: '#/components/schemas/ToolPromptFormat'
+          type: string
+          enum:
+            - json
+            - function_tag
+            - python_list
+          title: >-
+            This Enum refers to the prompt format for calling custom / zero shot tools
+          description: >-
+            The detailed prompts for each of these formats are added to llama cli
         response_format:
           $ref: '#/components/schemas/ResponseFormat'
         logprobs:
@@ -1931,7 +1917,11 @@ components:
           type: string
           description: The generated completion text
         stop_reason:
-          $ref: '#/components/schemas/StopReason'
+          type: string
+          enum:
+            - end_of_turn
+            - end_of_message
+            - out_of_tokens
           description: Reason why generation stopped
         logprobs:
           type: array
@@ -1976,11 +1966,18 @@ components:
           description: >-
             (Optional) List of tool definitions available to the model
         tool_choice:
-          $ref: '#/components/schemas/ToolChoice'
+          type: string
+          enum:
+            - auto
+            - required
           description: >-
             (Optional) Whether tool use is required or automatic. Defaults to ToolChoice.auto.
         tool_prompt_format:
-          $ref: '#/components/schemas/ToolPromptFormat'
+          type: string
+          enum:
+            - json
+            - function_tag
+            - python_list
           description: >-
             (Optional) Instructs the model how to format tool calls. By default, Llama
             Stack will attempt to use a format that is best adapted to the model.
@@ -2021,7 +2018,11 @@ components:
       type: object
       properties:
         event_type:
-          $ref: '#/components/schemas/ChatCompletionResponseEventType'
+          type: string
+          enum:
+            - start
+            - complete
+            - progress
           description: Type of the event
         delta:
           $ref: '#/components/schemas/ContentDelta'
@@ -2035,7 +2036,11 @@ components:
           description: >-
             Optional log probabilities for generated tokens
         stop_reason:
-          $ref: '#/components/schemas/StopReason'
+          type: string
+          enum:
+            - end_of_turn
+            - end_of_message
+            - out_of_tokens
           description: >-
             Optional reason why generation stopped, if complete
       additionalProperties: false
@@ -2044,14 +2049,6 @@ components:
         - delta
       title: >-
         An event during chat completion generation.
-    ChatCompletionResponseEventType:
-      type: string
-      enum:
-        - start
-        - complete
-        - progress
-      title: >-
-        Types of events that can occur during chat completion.
     ChatCompletionResponseStreamChunk:
       type: object
       properties:
@@ -2113,19 +2110,17 @@ components:
             - type: string
             - $ref: '#/components/schemas/ToolCall'
         parse_status:
-          $ref: '#/components/schemas/ToolCallParseStatus'
+          type: string
+          enum:
+            - started
+            - in_progress
+            - failed
+            - succeeded
       additionalProperties: false
       required:
         - type
         - tool_call
         - parse_status
-    ToolCallParseStatus:
-      type: string
-      enum:
-        - started
-        - in_progress
-        - failed
-        - succeeded
     CompletionRequest:
       type: object
       properties:
@@ -2174,7 +2169,11 @@ components:
           description: >-
             New content generated since last chunk. This can be one or more tokens.
         stop_reason:
-          $ref: '#/components/schemas/StopReason'
+          type: string
+          enum:
+            - end_of_turn
+            - end_of_message
+            - out_of_tokens
           description: >-
             Optional reason why generation stopped, if complete
         logprobs:
@@ -2210,10 +2209,25 @@ components:
           items:
             $ref: '#/components/schemas/ToolDef'
         tool_choice:
-          $ref: '#/components/schemas/ToolChoice'
+          type: string
+          enum:
+            - auto
+            - required
+          title: >-
+            Whether tool use is required or automatic. This is a hint to the model
+            which may not be followed. It depends on the Instruction Following capabilities
+            of the model.
           default: auto
         tool_prompt_format:
-          $ref: '#/components/schemas/ToolPromptFormat'
+          type: string
+          enum:
+            - json
+            - function_tag
+            - python_list
+          title: >-
+            This Enum refers to the prompt format for calling custom / zero shot tools
+          description: >-
+            The detailed prompts for each of these formats are added to llama cli
         max_infer_iters:
           type: integer
           default: 10
@@ -2656,7 +2670,12 @@ components:
           type: string
         tool_name:
           oneOf:
-            - $ref: '#/components/schemas/BuiltinTool'
+            - type: string
+              enum:
+                - brave_search
+                - wolfram_alpha
+                - photogen
+                - code_interpreter
             - type: string
         content:
           $ref: '#/components/schemas/InterleavedContent'
@@ -4942,8 +4961,6 @@ tags:
     description: ''
   - name: BooleanType
     description: ''
-  - name: BuiltinTool
-    description: ''
   - name: CancelTrainingJobRequest
     description: ''
   - name: ChatCompletionInputType
@@ -4955,9 +4972,6 @@ tags:
   - name: ChatCompletionResponseEvent
     description: >-
       An event during chat completion generation.
-  - name: ChatCompletionResponseEventType
-    description: >-
-      Types of events that can occur during chat completion.
   - name: ChatCompletionResponseStreamChunk
     description: >-
       A chunk of a streamed chat completion response.
@@ -5175,6 +5189,8 @@ tags:
     description: ''
   - name: SamplingParams
     description: ''
+  - name: SamplingStrategy
+    description: ''
   - name: SaveSpansToDatasetRequest
     description: ''
   - name: ScoreBatchRequest
@@ -5212,8 +5228,6 @@ tags:
     description: ''
   - name: SpanWithStatus
     description: ''
-  - name: StopReason
-    description: ''
   - name: StringType
     description: ''
   - name: StructuredLogEvent
@@ -5245,13 +5259,6 @@ tags:
     description: ''
   - name: ToolCallDelta
     description: ''
-  - name: ToolCallParseStatus
-    description: ''
-  - name: ToolChoice
-    description: >-
-      Whether tool use is required or automatic. This is a hint to the model which
-      may not be followed. It depends on the Instruction Following capabilities of
-      the model.
   - name: ToolDef
     description: ''
   - name: ToolDefinition
@@ -5269,30 +5276,6 @@ tags:
     description: ''
   - name: ToolParameter
     description: ''
-  - name: ToolPromptFormat
-    description: >-
-      This Enum refers to the prompt format for calling custom / zero shot tools
-
-
-      `json` --
-          Refers to the json format for calling tools.
-          The json format takes the form like
-          {
-              "type": "function",
-              "function" : {
-                  "name": "function_name",
-                  "description": "function_description",
-                  "parameters": {...}
-              }
-          }
-
-      `function_tag` --
-          This is an example of how you could define
-          your own user defined format for making tool calls.
-          The function_tag format looks like this,
-          <function=function_name>(parameters)</function>
-
-      The detailed prompts for each of these formats are added to llama cli
   - name: ToolResponse
     description: ''
   - name: ToolResponseMessage
@@ -5379,13 +5362,11 @@ x-tagGroups:
       - BatchCompletionResponse
       - BenchmarkEvalTaskConfig
       - BooleanType
-      - BuiltinTool
       - CancelTrainingJobRequest
       - ChatCompletionInputType
       - ChatCompletionRequest
       - ChatCompletionResponse
       - ChatCompletionResponseEvent
-      - ChatCompletionResponseEventType
       - ChatCompletionResponseStreamChunk
       - Checkpoint
       - CompletionInputType
@@ -5487,6 +5468,7 @@ x-tagGroups:
       - RunShieldResponse
       - SafetyViolation
       - SamplingParams
+      - SamplingStrategy
       - SaveSpansToDatasetRequest
       - ScoreBatchRequest
       - ScoreBatchResponse
@@ -5503,7 +5485,6 @@ x-tagGroups:
       - SpanStartPayload
       - SpanStatus
       - SpanWithStatus
-      - StopReason
       - StringType
       - StructuredLogEvent
       - StructuredLogPayload
@@ -5517,8 +5498,6 @@ x-tagGroups:
       - Tool
       - ToolCall
       - ToolCallDelta
-      - ToolCallParseStatus
-      - ToolChoice
       - ToolDef
       - ToolDefinition
       - ToolExecutionStep
@@ -5527,7 +5506,6 @@ x-tagGroups:
       - ToolInvocationResult
       - ToolParamDefinition
       - ToolParameter
-      - ToolPromptFormat
       - ToolResponse
       - ToolResponseMessage
       - TopKSamplingStrategy
diff --git a/llama_stack/apis/common/content_types.py b/llama_stack/apis/common/content_types.py
index 1d8cea5676..0b27a0196e 100644
--- a/llama_stack/apis/common/content_types.py
+++ b/llama_stack/apis/common/content_types.py
@@ -77,7 +77,6 @@ class ImageDelta(BaseModel):
     image: bytes
 
 
-@json_schema_type
 class ToolCallParseStatus(Enum):
     started = "started"
     in_progress = "in_progress"
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index 4541761755..a163943ebc 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -43,7 +43,6 @@ class LogProbConfig(BaseModel):
     top_k: Optional[int] = 0
 
 
-@json_schema_type
 class QuantizationType(Enum):
     """Type of model quantization to run inference with.
 
@@ -177,7 +176,6 @@ def validate_field(cls, v):
         return v
 
 
-@json_schema_type
 class ToolChoice(Enum):
     """Whether tool use is required or automatic. This is a hint to the model which may not be followed. It depends on the Instruction Following capabilities of the model.
 
@@ -199,7 +197,6 @@ class TokenLogProbs(BaseModel):
     logprobs_by_token: Dict[str, float]
 
 
-@json_schema_type
 class ChatCompletionResponseEventType(Enum):
     """Types of events that can occur during chat completion.
 
@@ -229,7 +226,6 @@ class ChatCompletionResponseEvent(BaseModel):
     stop_reason: Optional[StopReason] = None
 
 
-@json_schema_type
 class ResponseFormatType(Enum):
     """Types of formats for structured (guided) decoding.
 
diff --git a/llama_stack/apis/resource.py b/llama_stack/apis/resource.py
index d0ce726447..b84c619e4f 100644
--- a/llama_stack/apis/resource.py
+++ b/llama_stack/apis/resource.py
@@ -6,11 +6,9 @@
 
 from enum import Enum
 
-from llama_models.schema_utils import json_schema_type
 from pydantic import BaseModel, Field
 
 
-@json_schema_type
 class ResourceType(Enum):
     model = "model"
     shield = "shield"

From e3174bd62d30f0fc8fee02d41aad4c4414c3b092 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Date: Wed, 29 Jan 2025 09:30:44 -0800
Subject: [PATCH 4/5] Simplify Tags

---
 docs/openapi_generator/pyopenapi/generator.py |  14 -
 docs/resources/llama-stack-spec.html          | 944 +-----------------
 docs/resources/llama-stack-spec.yaml          | 608 +----------
 llama_stack/apis/agents/agents.py             |  10 +
 llama_stack/apis/inference/inference.py       |   7 +
 5 files changed, 55 insertions(+), 1528 deletions(-)

diff --git a/docs/openapi_generator/pyopenapi/generator.py b/docs/openapi_generator/pyopenapi/generator.py
index 390f0c6271..202d3732b4 100644
--- a/docs/openapi_generator/pyopenapi/generator.py
+++ b/docs/openapi_generator/pyopenapi/generator.py
@@ -677,12 +677,6 @@ def generate(self) -> Document:
                 )
             )
 
-        # types that are produced/consumed by operations
-        type_tags = [
-            self._build_type_tag(ref, schema)
-            for ref, schema in self.schema_builder.schemas.items()
-        ]
-
         # types that are emitted by events
         event_tags: List[Tag] = []
         events = get_endpoint_events(self.endpoint)
@@ -709,7 +703,6 @@ def generate(self) -> Document:
         # list all operations and types
         tags: List[Tag] = []
         tags.extend(operation_tags)
-        tags.extend(type_tags)
         tags.extend(event_tags)
         for extra_tag_group in extra_tag_groups.values():
             tags.extend(extra_tag_group)
@@ -724,13 +717,6 @@ def generate(self) -> Document:
                     tags=sorted(tag.name for tag in operation_tags),
                 )
             )
-        if type_tags:
-            tag_groups.append(
-                TagGroup(
-                    name=self.options.map("Types"),
-                    tags=sorted(tag.name for tag in type_tags),
-                )
-            )
         if event_tags:
             tag_groups.append(
                 TagGroup(
diff --git a/docs/resources/llama-stack-spec.html b/docs/resources/llama-stack-spec.html
index 15c9c9484c..0454e22ec7 100644
--- a/docs/resources/llama-stack-spec.html
+++ b/docs/resources/llama-stack-spec.html
@@ -2989,8 +2989,7 @@
                             "function_tag",
                             "python_list"
                         ],
-                        "title": "This Enum refers to the prompt format for calling custom / zero shot tools",
-                        "description": "The detailed prompts for each of these formats are added to llama cli"
+                        "title": "Prompt format for calling custom / zero shot tools."
                     },
                     "response_format": {
                         "$ref": "#/components/schemas/ResponseFormat"
@@ -3494,8 +3493,7 @@
                             "function_tag",
                             "python_list"
                         ],
-                        "title": "This Enum refers to the prompt format for calling custom / zero shot tools",
-                        "description": "The detailed prompts for each of these formats are added to llama cli"
+                        "title": "Prompt format for calling custom / zero shot tools."
                     },
                     "max_infer_iters": {
                         "type": "integer",
@@ -7842,804 +7840,68 @@
     ],
     "tags": [
         {
-            "name": "AgentCandidate",
-            "description": ""
-        },
-        {
-            "name": "AgentConfig",
-            "description": ""
-        },
-        {
-            "name": "AgentCreateResponse",
-            "description": ""
-        },
-        {
-            "name": "AgentSessionCreateResponse",
-            "description": ""
-        },
-        {
-            "name": "AgentStepResponse",
-            "description": ""
-        },
-        {
-            "name": "AgentTool",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnInputType",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnResponseEvent",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnResponseEventPayload",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnResponseStepCompletePayload",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnResponseStepProgressPayload",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnResponseStepStartPayload",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnResponseStreamChunk",
-            "description": "streamed agent turn completion response."
-        },
-        {
-            "name": "AgentTurnResponseTurnCompletePayload",
-            "description": ""
-        },
-        {
-            "name": "AgentTurnResponseTurnStartPayload",
-            "description": ""
-        },
-        {
-            "name": "Agents"
-        },
-        {
-            "name": "AggregationFunctionType",
-            "description": ""
-        },
-        {
-            "name": "AlgorithmConfig",
-            "description": ""
-        },
-        {
-            "name": "AppEvalTaskConfig",
-            "description": ""
-        },
-        {
-            "name": "AppendRowsRequest",
-            "description": ""
-        },
-        {
-            "name": "ArrayType",
-            "description": ""
-        },
-        {
-            "name": "BasicScoringFnParams",
-            "description": ""
-        },
-        {
-            "name": "BatchChatCompletionRequest",
-            "description": ""
-        },
-        {
-            "name": "BatchChatCompletionResponse",
-            "description": ""
-        },
-        {
-            "name": "BatchCompletionRequest",
-            "description": ""
-        },
-        {
-            "name": "BatchCompletionResponse",
-            "description": ""
+            "name": "Agents",
+            "description": "Main functionalities provided by this API:\n- Create agents with specific instructions and ability to use tools.\n- Interactions with agents are grouped into sessions (\"threads\"), and each interaction is called a \"turn\".\n- Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).\n- Agents can be provided with various shields (see the Safety API for more details).\n- Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.",
+            "x-displayName": "Agents API for creating and interacting with agentic systems."
         },
         {
             "name": "BatchInference (Coming Soon)"
         },
-        {
-            "name": "BenchmarkEvalTaskConfig",
-            "description": ""
-        },
-        {
-            "name": "BooleanType",
-            "description": ""
-        },
-        {
-            "name": "CancelTrainingJobRequest",
-            "description": ""
-        },
-        {
-            "name": "ChatCompletionInputType",
-            "description": ""
-        },
-        {
-            "name": "ChatCompletionRequest",
-            "description": ""
-        },
-        {
-            "name": "ChatCompletionResponse",
-            "description": "Response from a chat completion request."
-        },
-        {
-            "name": "ChatCompletionResponseEvent",
-            "description": "An event during chat completion generation."
-        },
-        {
-            "name": "ChatCompletionResponseStreamChunk",
-            "description": "A chunk of a streamed chat completion response."
-        },
-        {
-            "name": "Checkpoint",
-            "description": "Checkpoint created during training runs"
-        },
-        {
-            "name": "CompletionInputType",
-            "description": ""
-        },
-        {
-            "name": "CompletionMessage",
-            "description": "A message containing the model's (assistant) response in a chat conversation."
-        },
-        {
-            "name": "CompletionRequest",
-            "description": ""
-        },
-        {
-            "name": "CompletionResponse",
-            "description": "Response from a completion request."
-        },
-        {
-            "name": "CompletionResponseStreamChunk",
-            "description": "A chunk of a streamed completion response."
-        },
-        {
-            "name": "ContentDelta",
-            "description": ""
-        },
-        {
-            "name": "CreateAgentRequest",
-            "description": ""
-        },
-        {
-            "name": "CreateAgentSessionRequest",
-            "description": ""
-        },
-        {
-            "name": "CreateAgentTurnRequest",
-            "description": ""
-        },
-        {
-            "name": "DPOAlignmentConfig",
-            "description": ""
-        },
-        {
-            "name": "DataConfig",
-            "description": ""
-        },
-        {
-            "name": "Dataset",
-            "description": ""
-        },
-        {
-            "name": "DatasetFormat",
-            "description": ""
-        },
         {
             "name": "DatasetIO"
         },
         {
             "name": "Datasets"
         },
-        {
-            "name": "DefaultRAGQueryGeneratorConfig",
-            "description": ""
-        },
-        {
-            "name": "EfficiencyConfig",
-            "description": ""
-        },
-        {
-            "name": "EmbeddingsRequest",
-            "description": ""
-        },
-        {
-            "name": "EmbeddingsResponse",
-            "description": "Response containing generated embeddings."
-        },
         {
             "name": "Eval"
         },
-        {
-            "name": "EvalCandidate",
-            "description": ""
-        },
-        {
-            "name": "EvalTask",
-            "description": ""
-        },
-        {
-            "name": "EvalTaskConfig",
-            "description": ""
-        },
         {
             "name": "EvalTasks"
         },
         {
-            "name": "EvaluateResponse",
-            "description": ""
-        },
-        {
-            "name": "EvaluateRowsRequest",
-            "description": ""
-        },
-        {
-            "name": "Event",
-            "description": ""
-        },
-        {
-            "name": "GrammarResponseFormat",
-            "description": "Configuration for grammar-guided response generation."
-        },
-        {
-            "name": "GreedySamplingStrategy",
-            "description": ""
-        },
-        {
-            "name": "HealthInfo",
-            "description": ""
-        },
-        {
-            "name": "ImageContentItem",
-            "description": ""
-        },
-        {
-            "name": "ImageDelta",
-            "description": ""
-        },
-        {
-            "name": "Inference"
-        },
-        {
-            "name": "InferenceStep",
-            "description": ""
-        },
-        {
-            "name": "InsertChunksRequest",
-            "description": ""
-        },
-        {
-            "name": "InsertRequest",
-            "description": ""
+            "name": "Inference",
+            "description": "This API provides the raw interface to the underlying models. Two kinds of models are supported:\n- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.\n- Embedding models: these models generate embeddings to be used for semantic search.",
+            "x-displayName": "Llama Stack Inference API for generating completions, chat completions, and embeddings."
         },
         {
             "name": "Inspect"
         },
-        {
-            "name": "InterleavedContent",
-            "description": ""
-        },
-        {
-            "name": "InterleavedContentItem",
-            "description": ""
-        },
-        {
-            "name": "InvokeToolRequest",
-            "description": ""
-        },
-        {
-            "name": "Job",
-            "description": ""
-        },
-        {
-            "name": "JobStatus",
-            "description": ""
-        },
-        {
-            "name": "JsonSchemaResponseFormat",
-            "description": "Configuration for JSON schema-guided response generation."
-        },
-        {
-            "name": "JsonType",
-            "description": ""
-        },
-        {
-            "name": "LLMAsJudgeScoringFnParams",
-            "description": ""
-        },
-        {
-            "name": "LLMRAGQueryGeneratorConfig",
-            "description": ""
-        },
-        {
-            "name": "ListDatasetsResponse",
-            "description": ""
-        },
-        {
-            "name": "ListEvalTasksResponse",
-            "description": ""
-        },
-        {
-            "name": "ListModelsResponse",
-            "description": ""
-        },
-        {
-            "name": "ListPostTrainingJobsResponse",
-            "description": ""
-        },
-        {
-            "name": "ListProvidersResponse",
-            "description": ""
-        },
-        {
-            "name": "ListRoutesResponse",
-            "description": ""
-        },
-        {
-            "name": "ListScoringFunctionsResponse",
-            "description": ""
-        },
-        {
-            "name": "ListShieldsResponse",
-            "description": ""
-        },
-        {
-            "name": "ListToolGroupsResponse",
-            "description": ""
-        },
-        {
-            "name": "ListToolsResponse",
-            "description": ""
-        },
-        {
-            "name": "ListVectorDBsResponse",
-            "description": ""
-        },
-        {
-            "name": "LogEventRequest",
-            "description": ""
-        },
-        {
-            "name": "LogSeverity",
-            "description": ""
-        },
-        {
-            "name": "LoraFinetuningConfig",
-            "description": ""
-        },
-        {
-            "name": "MemoryRetrievalStep",
-            "description": ""
-        },
-        {
-            "name": "Message",
-            "description": ""
-        },
-        {
-            "name": "MetricEvent",
-            "description": ""
-        },
-        {
-            "name": "Model",
-            "description": ""
-        },
-        {
-            "name": "ModelCandidate",
-            "description": ""
-        },
-        {
-            "name": "ModelType",
-            "description": ""
-        },
         {
             "name": "Models"
         },
-        {
-            "name": "NumberType",
-            "description": ""
-        },
-        {
-            "name": "ObjectType",
-            "description": ""
-        },
-        {
-            "name": "OptimizerConfig",
-            "description": ""
-        },
-        {
-            "name": "OptimizerType",
-            "description": ""
-        },
-        {
-            "name": "PaginatedRowsResult",
-            "description": ""
-        },
-        {
-            "name": "ParamType",
-            "description": ""
-        },
         {
             "name": "PostTraining (Coming Soon)"
         },
-        {
-            "name": "PostTrainingJob",
-            "description": ""
-        },
-        {
-            "name": "PostTrainingJobArtifactsResponse",
-            "description": "Artifacts of a finetuning job."
-        },
-        {
-            "name": "PostTrainingJobStatusResponse",
-            "description": "Status of a finetuning job."
-        },
-        {
-            "name": "PreferenceOptimizeRequest",
-            "description": ""
-        },
-        {
-            "name": "ProviderInfo",
-            "description": ""
-        },
-        {
-            "name": "QATFinetuningConfig",
-            "description": ""
-        },
-        {
-            "name": "QueryChunksRequest",
-            "description": ""
-        },
-        {
-            "name": "QueryChunksResponse",
-            "description": ""
-        },
-        {
-            "name": "QueryCondition",
-            "description": ""
-        },
-        {
-            "name": "QueryConditionOp",
-            "description": ""
-        },
-        {
-            "name": "QueryRequest",
-            "description": ""
-        },
-        {
-            "name": "QuerySpanTreeResponse",
-            "description": ""
-        },
-        {
-            "name": "QuerySpansResponse",
-            "description": ""
-        },
-        {
-            "name": "QueryTracesResponse",
-            "description": ""
-        },
-        {
-            "name": "RAGDocument",
-            "description": ""
-        },
-        {
-            "name": "RAGQueryConfig",
-            "description": ""
-        },
-        {
-            "name": "RAGQueryGeneratorConfig",
-            "description": ""
-        },
-        {
-            "name": "RAGQueryResult",
-            "description": ""
-        },
-        {
-            "name": "RegexParserScoringFnParams",
-            "description": ""
-        },
-        {
-            "name": "RegisterDatasetRequest",
-            "description": ""
-        },
-        {
-            "name": "RegisterEvalTaskRequest",
-            "description": ""
-        },
-        {
-            "name": "RegisterModelRequest",
-            "description": ""
-        },
-        {
-            "name": "RegisterScoringFunctionRequest",
-            "description": ""
-        },
-        {
-            "name": "RegisterShieldRequest",
-            "description": ""
-        },
-        {
-            "name": "RegisterToolGroupRequest",
-            "description": ""
-        },
-        {
-            "name": "RegisterVectorDbRequest",
-            "description": ""
-        },
-        {
-            "name": "ResponseFormat",
-            "description": ""
-        },
-        {
-            "name": "RouteInfo",
-            "description": ""
-        },
-        {
-            "name": "RunEvalRequest",
-            "description": ""
-        },
-        {
-            "name": "RunShieldRequest",
-            "description": ""
-        },
-        {
-            "name": "RunShieldResponse",
-            "description": ""
-        },
         {
             "name": "Safety"
         },
-        {
-            "name": "SafetyViolation",
-            "description": ""
-        },
-        {
-            "name": "SamplingParams",
-            "description": ""
-        },
-        {
-            "name": "SamplingStrategy",
-            "description": ""
-        },
-        {
-            "name": "SaveSpansToDatasetRequest",
-            "description": ""
-        },
-        {
-            "name": "ScoreBatchRequest",
-            "description": ""
-        },
-        {
-            "name": "ScoreBatchResponse",
-            "description": ""
-        },
-        {
-            "name": "ScoreRequest",
-            "description": ""
-        },
-        {
-            "name": "ScoreResponse",
-            "description": ""
-        },
         {
             "name": "Scoring"
         },
-        {
-            "name": "ScoringFn",
-            "description": ""
-        },
-        {
-            "name": "ScoringFnParams",
-            "description": ""
-        },
         {
             "name": "ScoringFunctions"
         },
-        {
-            "name": "ScoringResult",
-            "description": ""
-        },
-        {
-            "name": "Session",
-            "description": "A single session of an interaction with an Agentic System."
-        },
-        {
-            "name": "Shield",
-            "description": "A safety shield resource that can be used to check content"
-        },
-        {
-            "name": "ShieldCallStep",
-            "description": ""
-        },
         {
             "name": "Shields"
         },
-        {
-            "name": "Span",
-            "description": ""
-        },
-        {
-            "name": "SpanEndPayload",
-            "description": ""
-        },
-        {
-            "name": "SpanStartPayload",
-            "description": ""
-        },
-        {
-            "name": "SpanStatus",
-            "description": ""
-        },
-        {
-            "name": "SpanWithStatus",
-            "description": ""
-        },
-        {
-            "name": "StringType",
-            "description": ""
-        },
-        {
-            "name": "StructuredLogEvent",
-            "description": ""
-        },
-        {
-            "name": "StructuredLogPayload",
-            "description": ""
-        },
-        {
-            "name": "SupervisedFineTuneRequest",
-            "description": ""
-        },
-        {
-            "name": "SyntheticDataGenerateRequest",
-            "description": ""
-        },
         {
             "name": "SyntheticDataGeneration (Coming Soon)"
         },
-        {
-            "name": "SyntheticDataGenerationResponse",
-            "description": "Response from the synthetic data generation. Batch of (prompt, response, score) tuples that pass the threshold."
-        },
-        {
-            "name": "SystemMessage",
-            "description": "A system message providing instructions or context to the model."
-        },
         {
             "name": "Telemetry"
         },
-        {
-            "name": "TextContentItem",
-            "description": ""
-        },
-        {
-            "name": "TextDelta",
-            "description": ""
-        },
-        {
-            "name": "TokenLogProbs",
-            "description": "Log probabilities for generated tokens."
-        },
-        {
-            "name": "Tool",
-            "description": ""
-        },
-        {
-            "name": "ToolCall",
-            "description": ""
-        },
-        {
-            "name": "ToolCallDelta",
-            "description": ""
-        },
-        {
-            "name": "ToolDef",
-            "description": ""
-        },
-        {
-            "name": "ToolDefinition",
-            "description": ""
-        },
-        {
-            "name": "ToolExecutionStep",
-            "description": ""
-        },
-        {
-            "name": "ToolGroup",
-            "description": ""
-        },
         {
             "name": "ToolGroups"
         },
-        {
-            "name": "ToolHost",
-            "description": ""
-        },
-        {
-            "name": "ToolInvocationResult",
-            "description": ""
-        },
-        {
-            "name": "ToolParamDefinition",
-            "description": ""
-        },
-        {
-            "name": "ToolParameter",
-            "description": ""
-        },
-        {
-            "name": "ToolResponse",
-            "description": ""
-        },
-        {
-            "name": "ToolResponseMessage",
-            "description": "A message representing the result of a tool invocation."
-        },
         {
             "name": "ToolRuntime"
         },
-        {
-            "name": "TopKSamplingStrategy",
-            "description": ""
-        },
-        {
-            "name": "TopPSamplingStrategy",
-            "description": ""
-        },
-        {
-            "name": "Trace",
-            "description": ""
-        },
-        {
-            "name": "TrainingConfig",
-            "description": ""
-        },
-        {
-            "name": "Turn",
-            "description": "A single turn in an interaction with an Agentic System."
-        },
-        {
-            "name": "URL",
-            "description": ""
-        },
-        {
-            "name": "UnionType",
-            "description": ""
-        },
-        {
-            "name": "UnstructuredLogEvent",
-            "description": ""
-        },
-        {
-            "name": "UserMessage",
-            "description": "A message from the user in a chat conversation."
-        },
-        {
-            "name": "VectorDB",
-            "description": ""
-        },
         {
             "name": "VectorDBs"
         },
         {
             "name": "VectorIO"
-        },
-        {
-            "name": "VersionInfo",
-            "description": ""
-        },
-        {
-            "name": "ViolationLevel",
-            "description": ""
         }
     ],
     "x-tagGroups": [
@@ -8667,196 +7929,6 @@
                 "VectorDBs",
                 "VectorIO"
             ]
-        },
-        {
-            "name": "Types",
-            "tags": [
-                "AgentCandidate",
-                "AgentConfig",
-                "AgentCreateResponse",
-                "AgentSessionCreateResponse",
-                "AgentStepResponse",
-                "AgentTool",
-                "AgentTurnInputType",
-                "AgentTurnResponseEvent",
-                "AgentTurnResponseEventPayload",
-                "AgentTurnResponseStepCompletePayload",
-                "AgentTurnResponseStepProgressPayload",
-                "AgentTurnResponseStepStartPayload",
-                "AgentTurnResponseStreamChunk",
-                "AgentTurnResponseTurnCompletePayload",
-                "AgentTurnResponseTurnStartPayload",
-                "AggregationFunctionType",
-                "AlgorithmConfig",
-                "AppEvalTaskConfig",
-                "AppendRowsRequest",
-                "ArrayType",
-                "BasicScoringFnParams",
-                "BatchChatCompletionRequest",
-                "BatchChatCompletionResponse",
-                "BatchCompletionRequest",
-                "BatchCompletionResponse",
-                "BenchmarkEvalTaskConfig",
-                "BooleanType",
-                "CancelTrainingJobRequest",
-                "ChatCompletionInputType",
-                "ChatCompletionRequest",
-                "ChatCompletionResponse",
-                "ChatCompletionResponseEvent",
-                "ChatCompletionResponseStreamChunk",
-                "Checkpoint",
-                "CompletionInputType",
-                "CompletionMessage",
-                "CompletionRequest",
-                "CompletionResponse",
-                "CompletionResponseStreamChunk",
-                "ContentDelta",
-                "CreateAgentRequest",
-                "CreateAgentSessionRequest",
-                "CreateAgentTurnRequest",
-                "DPOAlignmentConfig",
-                "DataConfig",
-                "Dataset",
-                "DatasetFormat",
-                "DefaultRAGQueryGeneratorConfig",
-                "EfficiencyConfig",
-                "EmbeddingsRequest",
-                "EmbeddingsResponse",
-                "EvalCandidate",
-                "EvalTask",
-                "EvalTaskConfig",
-                "EvaluateResponse",
-                "EvaluateRowsRequest",
-                "Event",
-                "GrammarResponseFormat",
-                "GreedySamplingStrategy",
-                "HealthInfo",
-                "ImageContentItem",
-                "ImageDelta",
-                "InferenceStep",
-                "InsertChunksRequest",
-                "InsertRequest",
-                "InterleavedContent",
-                "InterleavedContentItem",
-                "InvokeToolRequest",
-                "Job",
-                "JobStatus",
-                "JsonSchemaResponseFormat",
-                "JsonType",
-                "LLMAsJudgeScoringFnParams",
-                "LLMRAGQueryGeneratorConfig",
-                "ListDatasetsResponse",
-                "ListEvalTasksResponse",
-                "ListModelsResponse",
-                "ListPostTrainingJobsResponse",
-                "ListProvidersResponse",
-                "ListRoutesResponse",
-                "ListScoringFunctionsResponse",
-                "ListShieldsResponse",
-                "ListToolGroupsResponse",
-                "ListToolsResponse",
-                "ListVectorDBsResponse",
-                "LogEventRequest",
-                "LogSeverity",
-                "LoraFinetuningConfig",
-                "MemoryRetrievalStep",
-                "Message",
-                "MetricEvent",
-                "Model",
-                "ModelCandidate",
-                "ModelType",
-                "NumberType",
-                "ObjectType",
-                "OptimizerConfig",
-                "OptimizerType",
-                "PaginatedRowsResult",
-                "ParamType",
-                "PostTrainingJob",
-                "PostTrainingJobArtifactsResponse",
-                "PostTrainingJobStatusResponse",
-                "PreferenceOptimizeRequest",
-                "ProviderInfo",
-                "QATFinetuningConfig",
-                "QueryChunksRequest",
-                "QueryChunksResponse",
-                "QueryCondition",
-                "QueryConditionOp",
-                "QueryRequest",
-                "QuerySpanTreeResponse",
-                "QuerySpansResponse",
-                "QueryTracesResponse",
-                "RAGDocument",
-                "RAGQueryConfig",
-                "RAGQueryGeneratorConfig",
-                "RAGQueryResult",
-                "RegexParserScoringFnParams",
-                "RegisterDatasetRequest",
-                "RegisterEvalTaskRequest",
-                "RegisterModelRequest",
-                "RegisterScoringFunctionRequest",
-                "RegisterShieldRequest",
-                "RegisterToolGroupRequest",
-                "RegisterVectorDbRequest",
-                "ResponseFormat",
-                "RouteInfo",
-                "RunEvalRequest",
-                "RunShieldRequest",
-                "RunShieldResponse",
-                "SafetyViolation",
-                "SamplingParams",
-                "SamplingStrategy",
-                "SaveSpansToDatasetRequest",
-                "ScoreBatchRequest",
-                "ScoreBatchResponse",
-                "ScoreRequest",
-                "ScoreResponse",
-                "ScoringFn",
-                "ScoringFnParams",
-                "ScoringResult",
-                "Session",
-                "Shield",
-                "ShieldCallStep",
-                "Span",
-                "SpanEndPayload",
-                "SpanStartPayload",
-                "SpanStatus",
-                "SpanWithStatus",
-                "StringType",
-                "StructuredLogEvent",
-                "StructuredLogPayload",
-                "SupervisedFineTuneRequest",
-                "SyntheticDataGenerateRequest",
-                "SyntheticDataGenerationResponse",
-                "SystemMessage",
-                "TextContentItem",
-                "TextDelta",
-                "TokenLogProbs",
-                "Tool",
-                "ToolCall",
-                "ToolCallDelta",
-                "ToolDef",
-                "ToolDefinition",
-                "ToolExecutionStep",
-                "ToolGroup",
-                "ToolHost",
-                "ToolInvocationResult",
-                "ToolParamDefinition",
-                "ToolParameter",
-                "ToolResponse",
-                "ToolResponseMessage",
-                "TopKSamplingStrategy",
-                "TopPSamplingStrategy",
-                "Trace",
-                "TrainingConfig",
-                "Turn",
-                "URL",
-                "UnionType",
-                "UnstructuredLogEvent",
-                "UserMessage",
-                "VectorDB",
-                "VersionInfo",
-                "ViolationLevel"
-            ]
         }
     ]
 };
diff --git a/docs/resources/llama-stack-spec.yaml b/docs/resources/llama-stack-spec.yaml
index a83b3afe5d..0734ef236e 100644
--- a/docs/resources/llama-stack-spec.yaml
+++ b/docs/resources/llama-stack-spec.yaml
@@ -1817,9 +1817,7 @@ components:
             - function_tag
             - python_list
           title: >-
-            This Enum refers to the prompt format for calling custom / zero shot tools
-          description: >-
-            The detailed prompts for each of these formats are added to llama cli
+            Prompt format for calling custom / zero shot tools.
         response_format:
           $ref: '#/components/schemas/ResponseFormat'
         logprobs:
@@ -2225,9 +2223,7 @@ components:
             - function_tag
             - python_list
           title: >-
-            This Enum refers to the prompt format for calling custom / zero shot tools
-          description: >-
-            The detailed prompts for each of these formats are added to llama cli
+            Prompt format for calling custom / zero shot tools.
         max_infer_iters:
           type: integer
           default: 10
@@ -4905,411 +4901,54 @@ components:
 security:
   - Default: []
 tags:
-  - name: AgentCandidate
-    description: ''
-  - name: AgentConfig
-    description: ''
-  - name: AgentCreateResponse
-    description: ''
-  - name: AgentSessionCreateResponse
-    description: ''
-  - name: AgentStepResponse
-    description: ''
-  - name: AgentTool
-    description: ''
-  - name: AgentTurnInputType
-    description: ''
-  - name: AgentTurnResponseEvent
-    description: ''
-  - name: AgentTurnResponseEventPayload
-    description: ''
-  - name: AgentTurnResponseStepCompletePayload
-    description: ''
-  - name: AgentTurnResponseStepProgressPayload
-    description: ''
-  - name: AgentTurnResponseStepStartPayload
-    description: ''
-  - name: AgentTurnResponseStreamChunk
-    description: streamed agent turn completion response.
-  - name: AgentTurnResponseTurnCompletePayload
-    description: ''
-  - name: AgentTurnResponseTurnStartPayload
-    description: ''
   - name: Agents
-  - name: AggregationFunctionType
-    description: ''
-  - name: AlgorithmConfig
-    description: ''
-  - name: AppEvalTaskConfig
-    description: ''
-  - name: AppendRowsRequest
-    description: ''
-  - name: ArrayType
-    description: ''
-  - name: BasicScoringFnParams
-    description: ''
-  - name: BatchChatCompletionRequest
-    description: ''
-  - name: BatchChatCompletionResponse
-    description: ''
-  - name: BatchCompletionRequest
-    description: ''
-  - name: BatchCompletionResponse
-    description: ''
-  - name: BatchInference (Coming Soon)
-  - name: BenchmarkEvalTaskConfig
-    description: ''
-  - name: BooleanType
-    description: ''
-  - name: CancelTrainingJobRequest
-    description: ''
-  - name: ChatCompletionInputType
-    description: ''
-  - name: ChatCompletionRequest
-    description: ''
-  - name: ChatCompletionResponse
-    description: Response from a chat completion request.
-  - name: ChatCompletionResponseEvent
-    description: >-
-      An event during chat completion generation.
-  - name: ChatCompletionResponseStreamChunk
-    description: >-
-      A chunk of a streamed chat completion response.
-  - name: Checkpoint
-    description: Checkpoint created during training runs
-  - name: CompletionInputType
-    description: ''
-  - name: CompletionMessage
     description: >-
-      A message containing the model's (assistant) response in a chat conversation.
-  - name: CompletionRequest
-    description: ''
-  - name: CompletionResponse
-    description: Response from a completion request.
-  - name: CompletionResponseStreamChunk
-    description: >-
-      A chunk of a streamed completion response.
-  - name: ContentDelta
-    description: ''
-  - name: CreateAgentRequest
-    description: ''
-  - name: CreateAgentSessionRequest
-    description: ''
-  - name: CreateAgentTurnRequest
-    description: ''
-  - name: DPOAlignmentConfig
-    description: ''
-  - name: DataConfig
-    description: ''
-  - name: Dataset
-    description: ''
-  - name: DatasetFormat
-    description: ''
+      Main functionalities provided by this API:
+
+      - Create agents with specific instructions and ability to use tools.
+
+      - Interactions with agents are grouped into sessions ("threads"), and each interaction
+      is called a "turn".
+
+      - Agents can be provided with various tools (see the ToolGroups and ToolRuntime
+      APIs for more details).
+
+      - Agents can be provided with various shields (see the Safety API for more details).
+
+      - Agents can also use Memory to retrieve information from knowledge bases. See
+      the RAG Tool and Vector IO APIs for more details.
+    x-displayName: >-
+      Agents API for creating and interacting with agentic systems.
+  - name: BatchInference (Coming Soon)
   - name: DatasetIO
   - name: Datasets
-  - name: DefaultRAGQueryGeneratorConfig
-    description: ''
-  - name: EfficiencyConfig
-    description: ''
-  - name: EmbeddingsRequest
-    description: ''
-  - name: EmbeddingsResponse
-    description: >-
-      Response containing generated embeddings.
   - name: Eval
-  - name: EvalCandidate
-    description: ''
-  - name: EvalTask
-    description: ''
-  - name: EvalTaskConfig
-    description: ''
   - name: EvalTasks
-  - name: EvaluateResponse
-    description: ''
-  - name: EvaluateRowsRequest
-    description: ''
-  - name: Event
-    description: ''
-  - name: GrammarResponseFormat
-    description: >-
-      Configuration for grammar-guided response generation.
-  - name: GreedySamplingStrategy
-    description: ''
-  - name: HealthInfo
-    description: ''
-  - name: ImageContentItem
-    description: ''
-  - name: ImageDelta
-    description: ''
   - name: Inference
-  - name: InferenceStep
-    description: ''
-  - name: InsertChunksRequest
-    description: ''
-  - name: InsertRequest
-    description: ''
-  - name: Inspect
-  - name: InterleavedContent
-    description: ''
-  - name: InterleavedContentItem
-    description: ''
-  - name: InvokeToolRequest
-    description: ''
-  - name: Job
-    description: ''
-  - name: JobStatus
-    description: ''
-  - name: JsonSchemaResponseFormat
     description: >-
-      Configuration for JSON schema-guided response generation.
-  - name: JsonType
-    description: ''
-  - name: LLMAsJudgeScoringFnParams
-    description: ''
-  - name: LLMRAGQueryGeneratorConfig
-    description: ''
-  - name: ListDatasetsResponse
-    description: ''
-  - name: ListEvalTasksResponse
-    description: ''
-  - name: ListModelsResponse
-    description: ''
-  - name: ListPostTrainingJobsResponse
-    description: ''
-  - name: ListProvidersResponse
-    description: ''
-  - name: ListRoutesResponse
-    description: ''
-  - name: ListScoringFunctionsResponse
-    description: ''
-  - name: ListShieldsResponse
-    description: ''
-  - name: ListToolGroupsResponse
-    description: ''
-  - name: ListToolsResponse
-    description: ''
-  - name: ListVectorDBsResponse
-    description: ''
-  - name: LogEventRequest
-    description: ''
-  - name: LogSeverity
-    description: ''
-  - name: LoraFinetuningConfig
-    description: ''
-  - name: MemoryRetrievalStep
-    description: ''
-  - name: Message
-    description: ''
-  - name: MetricEvent
-    description: ''
-  - name: Model
-    description: ''
-  - name: ModelCandidate
-    description: ''
-  - name: ModelType
-    description: ''
+      This API provides the raw interface to the underlying models. Two kinds of models
+      are supported:
+
+      - LLM models: these models generate "raw" and "chat" (conversational) completions.
+
+      - Embedding models: these models generate embeddings to be used for semantic
+      search.
+    x-displayName: >-
+      Llama Stack Inference API for generating completions, chat completions, and
+      embeddings.
+  - name: Inspect
   - name: Models
-  - name: NumberType
-    description: ''
-  - name: ObjectType
-    description: ''
-  - name: OptimizerConfig
-    description: ''
-  - name: OptimizerType
-    description: ''
-  - name: PaginatedRowsResult
-    description: ''
-  - name: ParamType
-    description: ''
   - name: PostTraining (Coming Soon)
-  - name: PostTrainingJob
-    description: ''
-  - name: PostTrainingJobArtifactsResponse
-    description: Artifacts of a finetuning job.
-  - name: PostTrainingJobStatusResponse
-    description: Status of a finetuning job.
-  - name: PreferenceOptimizeRequest
-    description: ''
-  - name: ProviderInfo
-    description: ''
-  - name: QATFinetuningConfig
-    description: ''
-  - name: QueryChunksRequest
-    description: ''
-  - name: QueryChunksResponse
-    description: ''
-  - name: QueryCondition
-    description: ''
-  - name: QueryConditionOp
-    description: ''
-  - name: QueryRequest
-    description: ''
-  - name: QuerySpanTreeResponse
-    description: ''
-  - name: QuerySpansResponse
-    description: ''
-  - name: QueryTracesResponse
-    description: ''
-  - name: RAGDocument
-    description: ''
-  - name: RAGQueryConfig
-    description: ''
-  - name: RAGQueryGeneratorConfig
-    description: ''
-  - name: RAGQueryResult
-    description: ''
-  - name: RegexParserScoringFnParams
-    description: ''
-  - name: RegisterDatasetRequest
-    description: ''
-  - name: RegisterEvalTaskRequest
-    description: ''
-  - name: RegisterModelRequest
-    description: ''
-  - name: RegisterScoringFunctionRequest
-    description: ''
-  - name: RegisterShieldRequest
-    description: ''
-  - name: RegisterToolGroupRequest
-    description: ''
-  - name: RegisterVectorDbRequest
-    description: ''
-  - name: ResponseFormat
-    description: ''
-  - name: RouteInfo
-    description: ''
-  - name: RunEvalRequest
-    description: ''
-  - name: RunShieldRequest
-    description: ''
-  - name: RunShieldResponse
-    description: ''
   - name: Safety
-  - name: SafetyViolation
-    description: ''
-  - name: SamplingParams
-    description: ''
-  - name: SamplingStrategy
-    description: ''
-  - name: SaveSpansToDatasetRequest
-    description: ''
-  - name: ScoreBatchRequest
-    description: ''
-  - name: ScoreBatchResponse
-    description: ''
-  - name: ScoreRequest
-    description: ''
-  - name: ScoreResponse
-    description: ''
   - name: Scoring
-  - name: ScoringFn
-    description: ''
-  - name: ScoringFnParams
-    description: ''
   - name: ScoringFunctions
-  - name: ScoringResult
-    description: ''
-  - name: Session
-    description: >-
-      A single session of an interaction with an Agentic System.
-  - name: Shield
-    description: >-
-      A safety shield resource that can be used to check content
-  - name: ShieldCallStep
-    description: ''
   - name: Shields
-  - name: Span
-    description: ''
-  - name: SpanEndPayload
-    description: ''
-  - name: SpanStartPayload
-    description: ''
-  - name: SpanStatus
-    description: ''
-  - name: SpanWithStatus
-    description: ''
-  - name: StringType
-    description: ''
-  - name: StructuredLogEvent
-    description: ''
-  - name: StructuredLogPayload
-    description: ''
-  - name: SupervisedFineTuneRequest
-    description: ''
-  - name: SyntheticDataGenerateRequest
-    description: ''
   - name: SyntheticDataGeneration (Coming Soon)
-  - name: SyntheticDataGenerationResponse
-    description: >-
-      Response from the synthetic data generation. Batch of (prompt, response, score)
-      tuples that pass the threshold.
-  - name: SystemMessage
-    description: >-
-      A system message providing instructions or context to the model.
   - name: Telemetry
-  - name: TextContentItem
-    description: ''
-  - name: TextDelta
-    description: ''
-  - name: TokenLogProbs
-    description: Log probabilities for generated tokens.
-  - name: Tool
-    description: ''
-  - name: ToolCall
-    description: ''
-  - name: ToolCallDelta
-    description: ''
-  - name: ToolDef
-    description: ''
-  - name: ToolDefinition
-    description: ''
-  - name: ToolExecutionStep
-    description: ''
-  - name: ToolGroup
-    description: ''
   - name: ToolGroups
-  - name: ToolHost
-    description: ''
-  - name: ToolInvocationResult
-    description: ''
-  - name: ToolParamDefinition
-    description: ''
-  - name: ToolParameter
-    description: ''
-  - name: ToolResponse
-    description: ''
-  - name: ToolResponseMessage
-    description: >-
-      A message representing the result of a tool invocation.
   - name: ToolRuntime
-  - name: TopKSamplingStrategy
-    description: ''
-  - name: TopPSamplingStrategy
-    description: ''
-  - name: Trace
-    description: ''
-  - name: TrainingConfig
-    description: ''
-  - name: Turn
-    description: >-
-      A single turn in an interaction with an Agentic System.
-  - name: URL
-    description: ''
-  - name: UnionType
-    description: ''
-  - name: UnstructuredLogEvent
-    description: ''
-  - name: UserMessage
-    description: >-
-      A message from the user in a chat conversation.
-  - name: VectorDB
-    description: ''
   - name: VectorDBs
   - name: VectorIO
-  - name: VersionInfo
-    description: ''
-  - name: ViolationLevel
-    description: ''
 x-tagGroups:
   - name: Operations
     tags:
@@ -5333,190 +4972,3 @@ x-tagGroups:
       - ToolRuntime
       - VectorDBs
       - VectorIO
-  - name: Types
-    tags:
-      - AgentCandidate
-      - AgentConfig
-      - AgentCreateResponse
-      - AgentSessionCreateResponse
-      - AgentStepResponse
-      - AgentTool
-      - AgentTurnInputType
-      - AgentTurnResponseEvent
-      - AgentTurnResponseEventPayload
-      - AgentTurnResponseStepCompletePayload
-      - AgentTurnResponseStepProgressPayload
-      - AgentTurnResponseStepStartPayload
-      - AgentTurnResponseStreamChunk
-      - AgentTurnResponseTurnCompletePayload
-      - AgentTurnResponseTurnStartPayload
-      - AggregationFunctionType
-      - AlgorithmConfig
-      - AppEvalTaskConfig
-      - AppendRowsRequest
-      - ArrayType
-      - BasicScoringFnParams
-      - BatchChatCompletionRequest
-      - BatchChatCompletionResponse
-      - BatchCompletionRequest
-      - BatchCompletionResponse
-      - BenchmarkEvalTaskConfig
-      - BooleanType
-      - CancelTrainingJobRequest
-      - ChatCompletionInputType
-      - ChatCompletionRequest
-      - ChatCompletionResponse
-      - ChatCompletionResponseEvent
-      - ChatCompletionResponseStreamChunk
-      - Checkpoint
-      - CompletionInputType
-      - CompletionMessage
-      - CompletionRequest
-      - CompletionResponse
-      - CompletionResponseStreamChunk
-      - ContentDelta
-      - CreateAgentRequest
-      - CreateAgentSessionRequest
-      - CreateAgentTurnRequest
-      - DPOAlignmentConfig
-      - DataConfig
-      - Dataset
-      - DatasetFormat
-      - DefaultRAGQueryGeneratorConfig
-      - EfficiencyConfig
-      - EmbeddingsRequest
-      - EmbeddingsResponse
-      - EvalCandidate
-      - EvalTask
-      - EvalTaskConfig
-      - EvaluateResponse
-      - EvaluateRowsRequest
-      - Event
-      - GrammarResponseFormat
-      - GreedySamplingStrategy
-      - HealthInfo
-      - ImageContentItem
-      - ImageDelta
-      - InferenceStep
-      - InsertChunksRequest
-      - InsertRequest
-      - InterleavedContent
-      - InterleavedContentItem
-      - InvokeToolRequest
-      - Job
-      - JobStatus
-      - JsonSchemaResponseFormat
-      - JsonType
-      - LLMAsJudgeScoringFnParams
-      - LLMRAGQueryGeneratorConfig
-      - ListDatasetsResponse
-      - ListEvalTasksResponse
-      - ListModelsResponse
-      - ListPostTrainingJobsResponse
-      - ListProvidersResponse
-      - ListRoutesResponse
-      - ListScoringFunctionsResponse
-      - ListShieldsResponse
-      - ListToolGroupsResponse
-      - ListToolsResponse
-      - ListVectorDBsResponse
-      - LogEventRequest
-      - LogSeverity
-      - LoraFinetuningConfig
-      - MemoryRetrievalStep
-      - Message
-      - MetricEvent
-      - Model
-      - ModelCandidate
-      - ModelType
-      - NumberType
-      - ObjectType
-      - OptimizerConfig
-      - OptimizerType
-      - PaginatedRowsResult
-      - ParamType
-      - PostTrainingJob
-      - PostTrainingJobArtifactsResponse
-      - PostTrainingJobStatusResponse
-      - PreferenceOptimizeRequest
-      - ProviderInfo
-      - QATFinetuningConfig
-      - QueryChunksRequest
-      - QueryChunksResponse
-      - QueryCondition
-      - QueryConditionOp
-      - QueryRequest
-      - QuerySpanTreeResponse
-      - QuerySpansResponse
-      - QueryTracesResponse
-      - RAGDocument
-      - RAGQueryConfig
-      - RAGQueryGeneratorConfig
-      - RAGQueryResult
-      - RegexParserScoringFnParams
-      - RegisterDatasetRequest
-      - RegisterEvalTaskRequest
-      - RegisterModelRequest
-      - RegisterScoringFunctionRequest
-      - RegisterShieldRequest
-      - RegisterToolGroupRequest
-      - RegisterVectorDbRequest
-      - ResponseFormat
-      - RouteInfo
-      - RunEvalRequest
-      - RunShieldRequest
-      - RunShieldResponse
-      - SafetyViolation
-      - SamplingParams
-      - SamplingStrategy
-      - SaveSpansToDatasetRequest
-      - ScoreBatchRequest
-      - ScoreBatchResponse
-      - ScoreRequest
-      - ScoreResponse
-      - ScoringFn
-      - ScoringFnParams
-      - ScoringResult
-      - Session
-      - Shield
-      - ShieldCallStep
-      - Span
-      - SpanEndPayload
-      - SpanStartPayload
-      - SpanStatus
-      - SpanWithStatus
-      - StringType
-      - StructuredLogEvent
-      - StructuredLogPayload
-      - SupervisedFineTuneRequest
-      - SyntheticDataGenerateRequest
-      - SyntheticDataGenerationResponse
-      - SystemMessage
-      - TextContentItem
-      - TextDelta
-      - TokenLogProbs
-      - Tool
-      - ToolCall
-      - ToolCallDelta
-      - ToolDef
-      - ToolDefinition
-      - ToolExecutionStep
-      - ToolGroup
-      - ToolHost
-      - ToolInvocationResult
-      - ToolParamDefinition
-      - ToolParameter
-      - ToolResponse
-      - ToolResponseMessage
-      - TopKSamplingStrategy
-      - TopPSamplingStrategy
-      - Trace
-      - TrainingConfig
-      - Turn
-      - URL
-      - UnionType
-      - UnstructuredLogEvent
-      - UserMessage
-      - VectorDB
-      - VersionInfo
-      - ViolationLevel
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index d41abc8461..68eecaccb2 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -297,6 +297,16 @@ class AgentStepResponse(BaseModel):
 @runtime_checkable
 @trace_protocol
 class Agents(Protocol):
+    """Agents API for creating and interacting with agentic systems.
+
+    Main functionalities provided by this API:
+    - Create agents with specific instructions and ability to use tools.
+    - Interactions with agents are grouped into sessions ("threads"), and each interaction is called a "turn".
+    - Agents can be provided with various tools (see the ToolGroups and ToolRuntime APIs for more details).
+    - Agents can be provided with various shields (see the Safety API for more details).
+    - Agents can also use Memory to retrieve information from knowledge bases. See the RAG Tool and Vector IO APIs for more details.
+    """
+
     @webmethod(route="/agents", method="POST")
     async def create_agent(
         self,
diff --git a/llama_stack/apis/inference/inference.py b/llama_stack/apis/inference/inference.py
index a163943ebc..2debce1a7e 100644
--- a/llama_stack/apis/inference/inference.py
+++ b/llama_stack/apis/inference/inference.py
@@ -362,6 +362,13 @@ def get_model(self, identifier: str) -> Model: ...
 @runtime_checkable
 @trace_protocol
 class Inference(Protocol):
+    """Llama Stack Inference API for generating completions, chat completions, and embeddings.
+
+    This API provides the raw interface to the underlying models. Two kinds of models are supported:
+    - LLM models: these models generate "raw" and "chat" (conversational) completions.
+    - Embedding models: these models generate embeddings to be used for semantic search.
+    """
+
     model_store: ModelStore
 
     @webmethod(route="/inference/completion", method="POST")

From a2fa8d8b296c5db1dc2c8f0d1cf61577aefffb9e Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe <ashwin.bharambe@gmail.com>
Date: Wed, 29 Jan 2025 09:59:21 -0800
Subject: [PATCH 5/5] Fix library client

---
 llama_stack/distribution/library_client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama_stack/distribution/library_client.py b/llama_stack/distribution/library_client.py
index b2b290c660..fc9ee816cf 100644
--- a/llama_stack/distribution/library_client.py
+++ b/llama_stack/distribution/library_client.py
@@ -339,7 +339,7 @@ async def _call_non_streaming(
                 method=options.method,
                 url=options.url,
                 params=options.params,
-                headers=options.headers,
+                headers=options.headers or {},
                 json=options.json_data,
             ),
         )
@@ -388,7 +388,7 @@ async def gen():
                 method=options.method,
                 url=options.url,
                 params=options.params,
-                headers=options.headers,
+                headers=options.headers or {},
                 json=options.json_data,
             ),
         )